diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48e0e51a793cd861c2346fa4682ef23554d6c3eb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f7c32fc9ef2d7970affae52c1e6723beb52e293 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4412a4d2ea0fc160a31f9d59ee0afebaff771bf4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..203eaef35d3ac024c9c9cfc316a88d24b13107d5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py @@ -0,0 +1,258 @@ +import functools +import logging +import os +import sys +from dataclasses import dataclass +from typing import Any, List, Optional + +import sympy + +import torch + +from ...codecache import cache_dir +from ...config import cuda as inductor_cuda_config +from ...ir import Layout +from .cuda_env import get_cuda_arch, get_cuda_version + +log = logging.getLogger(__name__) + + +def _rename_cutlass_import(content: str, cutlass_modules: List[str]) -> str: + for cutlass_module in cutlass_modules: + content = content.replace( + f"from {cutlass_module} import ", + f"from cutlass_library.{cutlass_module} import ", + ) + return content + + +def _gen_cutlass_file( + file_name: str, cutlass_modules: List[str], src_dir: str, dst_dir: str +) -> None: + orig_full_path = os.path.abspath(os.path.join(src_dir, file_name)) + text = "" + with open(orig_full_path) as f: + text = f.read() + text = _rename_cutlass_import(text, cutlass_modules) + dst_full_path = os.path.abspath( + os.path.join( + dst_dir, + file_name, + ) + ) + with open(dst_full_path, "w") as f: + f.write(text) + + +@functools.lru_cache(None) +def try_import_cutlass() -> bool: + # Copy CUTLASS python scripts to a temp dir and add the temp dir to Python search path. + # This is a temporary hack to avoid CUTLASS module naming conflicts. + # TODO(ipiszy): remove this hack when CUTLASS solves Python scripts packaging structure issues. 
+ + cutlass_py_full_path = os.path.abspath( + os.path.join(inductor_cuda_config.cutlass_dir, "python/cutlass_library") + ) + tmp_cutlass_py_full_path = os.path.abspath( + os.path.join(cache_dir(), "torch_cutlass_library") + ) + dst_link = os.path.join(tmp_cutlass_py_full_path, "cutlass_library") + + if os.path.isdir(cutlass_py_full_path): + if tmp_cutlass_py_full_path not in sys.path: + if os.path.exists(dst_link): + assert os.path.islink( + dst_link + ), f"{dst_link} is not a symlink. Try to remove {dst_link} manually and try again." + assert os.path.realpath(os.readlink(dst_link)) == os.path.realpath( + cutlass_py_full_path + ), f"Symlink at {dst_link} does not point to {cutlass_py_full_path}" + else: + os.makedirs(tmp_cutlass_py_full_path, exist_ok=True) + os.symlink(cutlass_py_full_path, dst_link) + sys.path.append(tmp_cutlass_py_full_path) + try: + import cutlass_library.generator # noqa: F401 + import cutlass_library.library # noqa: F401 + import cutlass_library.manifest # noqa: F401 + + return True + + except ImportError as e: + log.debug( + "Failed to import CUTLASS packages: %s, ignoring the CUTLASS backend.", + str(e), + ) + else: + log.debug( + "Failed to import CUTLASS packages: CUTLASS repo does not exist: %s", + cutlass_py_full_path, + ) + return False + + +def _normalize_cuda_arch(arch: str) -> str: + if int(arch) >= 90: + return "90" + elif int(arch) >= 80: + return "80" + elif int(arch) >= 75: + return "75" + elif int(arch) >= 70: + return "70" + else: + raise NotImplementedError(f"Unsupported cuda arch: {arch}") + + +@dataclass +class CUTLASSArgs: + """ + CUTLASS args used to initialize a CUTLASS Manifest. + """ + + architectures: Optional[str] = None + cuda_version: Optional[str] = None + + operations = "all" + build_dir = "" + curr_build_dir = "" + generator_target = "" + kernels = "all" + ignore_kernels = "" + # TODO: these three look dead? 
+ kernel_filter_file: None = None + selected_kernel_list: None = None + interface_dir: None = None + filter_by_cc = True + disable_full_archs_compilation = False + + def __post_init__(self): + if self.architectures is None or self.cuda_version is None: + raise RuntimeError( + f"{self.architectures=} or {self.cuda_version=} is None!" + ) + self.architectures = _normalize_cuda_arch(self.architectures) + + +@functools.lru_cache(None) +def _gen_ops_cached(arch, version) -> List[Any]: + # Note: Cache needs to be specific for cuda architecture and version + + # Import cutlass python scripts. + assert try_import_cutlass() + import cutlass_library.generator as cutlass_generator + import cutlass_library.manifest as cutlass_manifest + + if arch is None or version is None: + log.error( + "Cannot detect cuda arch %s or cuda version %s. " + "Will discard all cutlass ops. " + "Please consider setting _inductor.cuda.arch and _inductor.cuda.version configs.", + arch, + version, + ) + return list() + arch = _normalize_cuda_arch(arch) + args = CUTLASSArgs(architectures=arch, cuda_version=version) + manifest = cutlass_manifest.Manifest(args) + + if arch == "90": + cutlass_generator.GenerateSM90(manifest, args.cuda_version) + cutlass_generator.GenerateSM80(manifest, args.cuda_version) + else: + try: + func = getattr(cutlass_generator, "GenerateSM" + arch) + func(manifest, args.cuda_version) + except AttributeError as e: + raise NotImplementedError( + "Arch " + arch + " is not supported by current cutlass lib." + ) from e + return manifest.operations + + +def gen_ops() -> List[Any]: + """ + Generates all supported CUTLASS operations. + """ + arch = get_cuda_arch() + version = get_cuda_version() + return _gen_ops_cached(arch, version) + + +def dtype_match( + torch_dtype: Optional[torch.dtype], + cutlass_dtype: "cutlass_library.library.DataType", # type: ignore[name-defined] # noqa: F821 +) -> bool: + # Import cutlass python scripts. 
+ assert try_import_cutlass() + import cutlass_library + + if torch_dtype == torch.float: + return ( + cutlass_dtype == cutlass_library.library.DataType.f32 + or cutlass_dtype == cutlass_library.library.DataType.tf32 + ) + elif torch_dtype == torch.half: + return cutlass_dtype == cutlass_library.library.DataType.f16 + elif torch_dtype == torch.bfloat16: + return cutlass_dtype == cutlass_library.library.DataType.bf16 + else: + return False + + +def get_accumulator_dtype( + input_torch_dtypes: List[torch.dtype], +) -> Optional[torch.dtype]: + """ + Given a list of input torch dtypes, returns the inferred accumulator torch dtype. + """ + + if len(input_torch_dtypes) == 0: + return None + torch_dtype = input_torch_dtypes[0] + for dtype in input_torch_dtypes[1:]: + if torch_dtype != dtype: + raise RuntimeError(f"Unmatched input dtypes: {torch_dtype=}, {dtype=}") + if torch_dtype == torch.half: + if torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction: + return torch_dtype + else: + return torch.float + if torch_dtype in {torch.bfloat16, torch.float}: + return torch.float + raise NotImplementedError(f"Unsupported data type: {input_torch_dtypes=}") + + +def get_alignments(torch_dtype: torch.dtype) -> List[int]: + """ + Returns all possible valid CUTLASS alignments in terms of the number of elements for a given dtype. + CUTLASS gemm / conv SM80 APIs support 16 bytes max alignment, and 2 bytes min alignment. + """ + + if torch_dtype in (torch.half, torch.bfloat16): + return [8, 4, 2, 1] + elif torch_dtype == torch.float: + return [4, 2, 1] + else: + raise NotImplementedError(f"unsupported {torch_dtype=} for alignments") + + +def get_max_alignment(inductor_layout: Layout) -> int: + """ + Returns the max alignment (in terms of number of elements) for a given Inductor Layout. 
+ """ + + dtype = inductor_layout.dtype + size = inductor_layout.size + offset = inductor_layout.offset + + def is_static_int(number): + return isinstance(number, (int, sympy.Integer)) + + if is_static_int(size[-1]) and is_static_int(offset): + alignments = get_alignments(dtype) + for alignment in alignments: + if int(size[-1]) % alignment == 0 and int(offset) % alignment == 0: + return alignment + + return 1 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py new file mode 100644 index 0000000000000000000000000000000000000000..e0306cf2d389f35cfca0d949b339ed629162ab2e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py @@ -0,0 +1,3931 @@ +from __future__ import annotations + +import collections +import contextlib +import dataclasses +import functools +import itertools +import logging +import math +import operator +import os +import textwrap +from functools import lru_cache +from typing import ( + Any, + Callable, + cast, + Counter, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) + +import sympy + +import torch +import torch._logging + +from torch._inductor.metrics import is_metric_table_enabled, log_kernel_metadata +from torch._prims_common import is_integer_dtype +from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.value_ranges import ValueRanges +from torch.utils._triton import has_triton_package + +from ..._dynamo.utils import counters +from .. 
import config, ir, scheduler +from ..codecache import code_hash, get_path, PyCodeCache +from ..dependencies import Dep, MemoryDep, StarDep, WeakDep +from ..ir import IRNode, ReductionHint, TritonTemplateBuffer +from ..optimize_indexing import indexing_dtype_strength_reduction +from ..scheduler import BaseSchedulerNode, BaseScheduling, WhyNoFuse +from ..triton_heuristics import AutotuneHint +from ..utils import ( + cache_on_self, + do_bench, + get_dtype_size, + get_fused_kernel_name, + get_kernel_metadata, + get_max_y_grid, + green_text, + is_welford_reduction, + next_power_of_2, + Placeholder, + sympy_dot, + sympy_index_symbol, + sympy_product, + sympy_subs, + unique, + yellow_text, +) +from ..virtualized import _ops as ops, OpsHandler, ReductionType, StoreMode, V +from ..wrapper_benchmark import get_kernel_category_by_source_code +from .common import ( + CSE, + CSEVariable, + DeferredLine, + free_symbol_startswith, + IndentedBuffer, + index_prevent_reordering, + Kernel, + OpOverrides, + PythonPrinter, + SizeArg, + TensorArg, +) +from .multi_kernel import MultiKernel +from .triton_utils import config_of, signature_of, signature_to_meta + +log = logging.getLogger(__name__) +perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") +schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") +fusion_log = torch._logging.getArtifactLogger(__name__, "fusion") + + +@lru_cache(None) +def gen_attr_descriptor_import(): + """ + import AttrsDescriptor if the triton version is new enough to have this + class defined. 
+ """ + if not has_triton_package(): + return "" + + import triton.compiler.compiler + + if hasattr(triton.compiler.compiler, "AttrsDescriptor"): + return "from triton.compiler.compiler import AttrsDescriptor" + else: + return "" + + +@lru_cache(None) +def gen_common_triton_imports(): + imports = IndentedBuffer() + imports.splice( + """ + import triton + import triton.language as tl + """ + ) + if attr_desc := gen_attr_descriptor_import(): + imports.writeline(attr_desc) + + imports.splice( + """ + from torch._inductor import triton_helpers, triton_heuristics + from torch._inductor.ir import ReductionHint, TileHint + from torch._inductor.triton_helpers import libdevice, math as tl_math + from torch._inductor.triton_heuristics import AutotuneHint + from torch._inductor.utils import instance_descriptor + """ + ) + return imports.getvalue() + + +@dataclasses.dataclass +class IndexingOptions: + index_str: str + mask_vars: Set[sympy.Symbol] + mask_str: str + expand_str: Optional[str] + _has_rindex: bool + + def has_mask(self): + return bool(self.mask_vars) + + def has_rindex(self): + return self._has_rindex + + def has_tmpmask(self): + return "tmp" in self.mask_str + + def has_rmask(self): + return "rmask" in self.mask_str + + +@dataclasses.dataclass +class BlockPtrOptions: + constant_offset: sympy.Expr + shape: List[sympy.Expr] + strides: List[sympy.Expr] + block_shape: List[str] + order: List[int] + offsets: List[str] + mask_vars: Set[sympy.Symbol] + reshape_suffix: List[str] + + @staticmethod + def create( + strides: List[sympy.Expr], + constant_offset: sympy.Expr, + range_trees: List[IterationRangesEntry], + mask_vars: Set[sympy.Symbol], + ) -> BlockPtrOptions: + """Helper to create a BlockPtrOptions instance""" + block_shape = [f"{t.prefix.upper()}BLOCK" for t in range_trees] + reshape_suffix = [*block_shape] + + broadcasting_dim = [s == 0 for s in strides] + for i, is_broadcasting in enumerate(broadcasting_dim): + if is_broadcasting: + # drop any stride==0 
dimensions for performance + reshape_suffix[i] = "1" + + if V.kernel.no_x_dim: + assert range_trees[0].prefix == "x" + reshape_suffix.pop(0) + + if ( + not V.kernel.inside_reduction + and len(strides) == len(V.kernel.numels) - 1 + and V.kernel.numels[-1] != 1 + ): + # Need to expand rank by 1 to match rank when self.inside_reduction=True + reshape_suffix.append("1") + + def filter(it): + """Removes any broadcasting dims from a given sequence""" + assert len(it) == len(broadcasting_dim) + return [ + item + for item, is_broadcasting in zip(it, broadcasting_dim) + if not is_broadcasting + ] + + return BlockPtrOptions( + constant_offset=V.graph.sizevars.lookup_precomputed_size(constant_offset), + shape=[ + V.graph.sizevars.lookup_precomputed_size(t.numel) + for t in filter(range_trees) + ], + strides=[*map(V.graph.sizevars.lookup_precomputed_size, filter(strides))], + block_shape=filter(block_shape), + order=V.graph.sizevars.guarded_order(filter(strides)), + offsets=filter([f"{t.prefix}offset" for t in range_trees]), + mask_vars=mask_vars, + reshape_suffix=reshape_suffix, + ) + + def format(self, name: str, roffset=True) -> str: + """ + Codegen a call to tl.make_block_ptr() + + Args: + name: variable name for pointer + roffset: should roffset be included in offsets=..., for use with tl.advance() + + Returns: + "tl.make_block_ptr(...)" + """ + f = V.kernel.index_to_str + offsets = [*self.offsets] + if not roffset: + offsets[offsets.index("roffset")] = "0" + args = [ + f"{name} + ({f(self.constant_offset)})" + if self.constant_offset != 0 + else name, + f"shape={f(self.shape)}", + f"strides={f(self.strides)}", + f"block_shape={f(self.block_shape)}", + f"order={f(self.order)}", + f"offsets={f(offsets)}", + ] + return f"tl.make_block_ptr({', '.join(args)})" + + @cache_on_self + def boundary_check(self) -> List[int]: + """List of indices to pass to tl.load(boundary_check=...)""" + check = [] + for i in range(len(self.shape)): + if ( + self.block_shape[i] != "1" + and not 
V.graph.sizevars.statically_known_equals(self.strides[i], 0) # type: ignore[arg-type] + and not V.graph.sizevars.statically_known_multiple_of( + self.shape[i], + config.triton.max_block[self.block_shape[i][0]], # type: ignore[arg-type] + ) + and not (V.kernel.no_x_dim and self.block_shape[i] == "XBLOCK") + ): + check.append(i) + return check + + def advance_roffset(self): + """Codegen string to pass to tl.advance(name, ...)""" + advance = ["0"] * len(self.shape) + advance[self.offsets.index("roffset")] = "RBLOCK" + return V.kernel.index_to_str(advance) + + def has_rindex(self): + return "RBLOCK" in self.block_shape + + def has_rmask(self): + return self.has_rindex() + + def has_tmpmask(self): + return False # block_ptr can't do indirect indexing + + def has_mask(self): + return bool(self.boundary_check()) + + +def triton_reshape(value: str, old_shape: List[str], new_shape: List[str]): + """Workaround https://github.com/openai/triton/issues/2836""" + assert isinstance(old_shape, list) and isinstance(new_shape, list) + if old_shape == new_shape: + return value + if [s for s in new_shape if s != "1"] != old_shape: + return f"tl.reshape({value}, [{', '.join(new_shape)}])" + # rewrite to [:, None] syntax, which is less buggy + idx = 0 + expand = [] + for size in new_shape: + if idx < len(old_shape) and size == old_shape[idx]: + expand.append(":") + idx += 1 + else: + assert size == "1" + expand.append("None") + assert idx == len(old_shape) + return f"{value}[{', '.join(expand)}]" + + +class TritonPrinter(PythonPrinter): + def _print_floor(self, expr): + assert len(expr.args) == 1 + return ( + f"libdevice.floor({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + ) + + def _print_ceiling(self, expr): + assert len(expr.args) == 1 + return f"libdevice.ceil({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + + def _helper_sqrt(self, expr): + return f"libdevice.sqrt({self._print(expr)}.to(tl.float32))" + + def _print_Where(self, expr): + c = 
self.doprint(expr.args[0]) + p = self.doprint(expr.args[1]) + q = self.doprint(expr.args[2]) + return f"tl.where({c}, {p}, {q})" + + def _print_Min(self, expr): + nargs = len(expr.args) + if len(expr.args) == 1: + return self._print(expr.args[0]) + + mid = len(expr.args) // 2 + a = self._print(sympy.Min(*expr.args[:mid])) + b = self._print(sympy.Min(*expr.args[mid:])) + return f"tl.minimum({a}, {b})" + + def _print_Max(self, expr): + nargs = len(expr.args) + if len(expr.args) == 1: + return self._print(expr.args[0]) + + mid = len(expr.args) // 2 + a = self._print(sympy.Max(*expr.args[:mid])) + b = self._print(sympy.Max(*expr.args[mid:])) + + return f"tl.maximum({a}, {b})" + + def _print_Abs(self, expr): + assert len(expr.args) == 1 + return f"tl_math.abs({self._print(expr.args[0])})" + + def _print_cos(self, expr): + assert len(expr.args) == 1 + return f"libdevice.cos(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_cosh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.cosh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_acos(self, expr): + assert len(expr.args) == 1 + return f"libdevice.acos(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_sin(self, expr): + assert len(expr.args) == 1 + return f"libdevice.sin(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_sinh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.sinh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_asin(self, expr): + assert len(expr.args) == 1 + return f"libdevice.asin(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_tan(self, expr): + assert len(expr.args) == 1 + return f"libdevice.tan(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_tanh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.tanh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_atan(self, expr): + assert len(expr.args) == 1 + return 
f"libdevice.atan(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_FloorDiv(self, expr): + if expr.is_integer: + return super()._print_FloorDiv(expr) + + x, div = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + return f"libdevice.floor({x} / {div}).to({V.kernel.index_dtype})" + + def _print_Round(self, expr): + assert len(expr.args) == 1 + return ( + f"libdevice.llrint({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + ) + + def _print_RoundDecimal(self, expr): + assert len(expr.args) == 2 + number, ndigits = expr.args + if number.is_integer: + # ndigits < 0 should have been filtered by the sympy function + assert ndigits < 0 + raise ValueError( + f"For integer inputs, only non-negative ndigits are currently supported, but got {ndigits}." + ) + return f"libdevice.nearbyint(1e{ndigits} * {self.paren(self._print(number))}) * 1e{-ndigits}" + + +texpr = TritonPrinter().doprint +pexpr = PythonPrinter().doprint + + +def triton_compute_type(dtype): + triton_type_name = str(dtype).split(".")[-1] + if triton_type_name == "bool": + triton_type_name = "int1" + elif triton_type_name in ("float16", "bfloat16"): + # float16 math is done in float32 inside the kernel + triton_type_name = "float32" + elif triton_type_name == "float8_e4m3fn": + triton_type_name = "float8e4nv" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5" + elif triton_type_name == "float8_e4m3fnuz": + triton_type_name = "float8e4b8" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5b16" + return f"tl.{triton_type_name}" + + +def triton_store_type(dtype): + triton_type_name = str(dtype).split(".")[-1] + if triton_type_name == "bool": + triton_type_name = "int8" + elif triton_type_name == "float8_e4m3fn": + triton_type_name = "float8e4nv" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5" + return f"tl.{triton_type_name}" + + +def triton_acc_type(dtype): + if is_integer_dtype(dtype) and 
dtype.is_signed: + nbits = 64 if dtype == torch.int64 else 32 + return f"tl.int{nbits}" + return triton_compute_type(dtype) + + +def triton_constant(value): + if value == float("inf"): + return 'float("inf")' + elif value == float("-inf"): + return 'float("-inf")' + elif math.isnan(value): + return 'float("nan")' + return repr(value) + + +class TritonCSEVariable(CSEVariable): + def __init__(self, name, bounds: ValueRanges[Any]): + super().__init__(name, bounds) + # We'll use this to track which masks the variable needs when used for indirect indexing + self.mask_vars: Set[str] = set() + + def update_on_args(self, name, args, kwargs): + # When making a variable that is going to be used in indirect indexing + # if a where clause is used it should mean that the result is always a + # valid index, so you shouldn't include any of the dependent variables + # in the resulting load mask + if name == "where": + return + for arg in args: + if isinstance(arg, TritonCSEVariable): + self.mask_vars.update(arg.mask_vars) + elif isinstance(arg, sympy.Symbol) and arg.name[0] in "xyr": + # most of the time index vars don't need masks associated with them + # however, when index vars are used to compute indices for indirect reads + # those reads should subsequently be masked, + self.mask_vars.update({f"{arg.name[0]}mask"}) + + def __repr__(self): + return f"TritonCSEVariable(name={self.name})" + + +class TritonOverrides(OpOverrides): + """Map element-wise ops to Triton""" + + @staticmethod + def to_dtype(x, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None): + def _get_min_elements_per_thread( + src_dtype: torch.dtype, dst_dtype: torch.dtype + ) -> int: + if src_dtype == dst_dtype: + # No data type conversion is needed. No requirements on min_elem_per_thread. + return 0 + + # fp8 data type conversions has min_elem_per_thread requirements. 
+ # Refer to Triton implementations here: + # https://github.com/openai/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10. + fp8_dtypes = { + torch.float8_e4m3fn, + torch.float8_e5m2, + } + # Triton doesn't support type conversions between fp8_e4m3 and fp8_e5m2. + assert not ( + src_dtype in fp8_dtypes + and dst_dtype in fp8_dtypes + and src_dtype != dst_dtype + ), "Conversions between float8_e5m2 and float8_e4m3fn is not supported!" + if src_dtype == torch.float8_e5m2 or dst_dtype == torch.float8_e5m2: + return 4 + if src_dtype == torch.float8_e4m3fn or dst_dtype == torch.float8_e4m3fn: + return 2 + # No requirements on min_elem_per_thread. + return 0 + + if src_dtype is not None: + # Both dtype and src_dtype are set. This is used by torch to(dtype=dtype). + # It takes the maximum min_elem_per_thread if there are multiple fp8 conversions + # in the same kernel. + V.kernel.min_elem_per_thread = max( + _get_min_elements_per_thread(src_dtype, dtype), + V.kernel.min_elem_per_thread, + ) + + if dtype == torch.bool: + return f"({x} != 0)" + elif dtype == torch.uint8: + # to work around llvm uint conversion semantics + # that produces 0's for negative values + return f"{x}.to(tl.int8).to(tl.uint8)" + return f"{x}.to({triton_compute_type(dtype)})" + + @staticmethod + def to_dtype_bitcast(x, dtype: torch.dtype, src_dtype: torch.dtype): + triton_dtype = triton_compute_type(dtype) + # We may promote float16 or bfloat16 to float32 and cause the + # bitwidth of dtype to be different from the input tensor (i.e. float32). + # In such as case, we will have to convert the input tensor to + # its src_type, perform bitcast, and then convert the bit-casted + # tensor back to float to ensure we use values with the right precision. 
+ if src_dtype in (torch.float16, torch.bfloat16): + triton_src_dtype = str(src_dtype).split(".")[-1] + cast_x = f"{x}.to(tl.{triton_src_dtype})" + cast_x = f"{cast_x}.to({triton_dtype}, bitcast=True)" + return f"{cast_x}.to(tl.float32)" + else: + return f"{x}.to({triton_dtype}, bitcast=True)" + + @staticmethod + def _shaped_constant(value, dtype, shape): + type_ = torch._prims_common.dtype_to_type(dtype) + triton_val = triton_constant(type_(value)) + triton_type = triton_compute_type(dtype) + + if triton_type == "tl.float32": + # Float constants are always f32 in triton + return triton_val + + # NOTE: We use a tensor here in order to get the expected type. + # Otherwise, e.g. float64 constants would be trunctated to float32. + return f"tl.full({shape}, {triton_val}, {triton_type})" + + @classmethod + def constant(cls, value, dtype): + return cls._shaped_constant(value, dtype, shape=[]) + + @staticmethod + def abs(x): + return f"tl_math.abs({x})" + + @staticmethod + def libdevice_abs(x): + return f"libdevice.abs({x})" + + @staticmethod + def exp(x): + return f"tl_math.exp({x})" + + @staticmethod + def libdevice_exp(x): + return f"libdevice.exp({x})" + + @staticmethod + def exp2(x): + return f"libdevice.exp2({x})" + + @staticmethod + def expm1(x): + return f"libdevice.expm1({x})" + + @staticmethod + def sqrt(x): + return f"libdevice.sqrt({x})" + + @staticmethod + def libdevice_sqrt(x): + return f"libdevice.sqrt({x})" + + @staticmethod + def relu(x): + bug = config.triton.inject_relu_bug_TESTING_ONLY + if bug == "compile_error": + return "compile error!" 
+ elif bug == "runtime_error": + # NB: this only triggers runtime error as long as input + # is not all zero + return f'triton_helpers.device_assert_then({x} == 0, "injected assert fail", {x})' + elif bug == "accuracy": + return f"{x} + 1" + elif bug is None: + return ops.maximum("0", x) + else: + raise AssertionError( + f"unrecognized config triton.inject_relu_bug_TESTING_ONLY = {bug!r}" + ) + + @staticmethod + def minimum(a, b): + return f"triton_helpers.minimum({a}, {b})" + + @staticmethod + def maximum(a, b): + return f"triton_helpers.maximum({a}, {b})" + + @staticmethod + def where(a, b, c): + return f"tl.where({a}, {b}, {c})" + + @staticmethod + def cos(x): + return f"tl_math.cos({x})" + + @staticmethod + def libdevice_cos(x): + return f"libdevice.cos({x})" + + @staticmethod + def sin(x): + return f"tl_math.sin({x})" + + @staticmethod + def libdevice_sin(x): + return f"libdevice.sin({x})" + + @classmethod + def index_expr(cls, expr, dtype): + raise NotImplementedError("ops.index_expr not implemented outside a kernel") + + @staticmethod + def masked(mask, body, other): + raise NotImplementedError("ops.masked not implemented outside a kernel") + + @staticmethod + def lgamma(x): + return f"libdevice.lgamma({x})" + + @staticmethod + def erf(x): + return f"libdevice.erf({x})" + + @staticmethod + def cosh(x): + return f"libdevice.cosh({x})" + + @staticmethod + def sinh(x): + return f"libdevice.sinh({x})" + + @staticmethod + def acos(x): + return f"libdevice.acos({x})" + + @staticmethod + def acosh(x): + return f"libdevice.acosh({x})" + + @staticmethod + def asin(x): + return f"libdevice.asin({x})" + + @staticmethod + def asinh(x): + return f"libdevice.asinh({x})" + + @staticmethod + def atan2(x, y): + return f"libdevice.atan2({x}, {y})" + + @staticmethod + def atan(x): + return f"libdevice.atan({x})" + + @staticmethod + def atanh(x): + return f"libdevice.atanh({x})" + + @staticmethod + def copysign(x, y): + return f"libdevice.copysign({x}, {y})" + + @staticmethod + 
def erfc(x): + return f"libdevice.erfc({x})" + + @staticmethod + def erfinv(x): + return f"libdevice.erfinv({x})" + + @staticmethod + def hypot(x, y): + return f"libdevice.hypot({x}, {y})" + + @staticmethod + def log10(x): + return f"libdevice.log10({x})" + + @staticmethod + def nextafter(x, y): + return f"libdevice.nextafter({x}, {y})" + + @staticmethod + def logical_and(a, b): + return f"{a} & {b}" + + @staticmethod + def logical_not(a): + return f"{a} == 0" + + @staticmethod + def logical_or(a, b): + return f"{a} | {b}" + + @staticmethod + def logical_xor(a, b): + return f"({a} ^ {b})" + + @staticmethod + def bitwise_and(a, b): + return f"{a} & {b}" + + @staticmethod + def bitwise_not(a): + return f"~{a}" + + @staticmethod + def bitwise_or(a, b): + return f"{a} | {b}" + + @staticmethod + def bitwise_xor(a, b): + return f"{a} ^ {b}" + + @staticmethod + def bitwise_left_shift(a, b): + return f"{a} << {b}" + + @staticmethod + def bitwise_right_shift(a, b): + return f"{a} >> {b}" + + @staticmethod + def rand(seed, offset): + offset = f"({offset}).to(tl.uint32)" + return f"tl.rand({seed}, {offset})" + + @staticmethod + def randn(seed, offset): + offset = f"({offset}).to(tl.uint32)" + return f"tl.randn({seed}, {offset})" + + @staticmethod + def randint64(seed, offset, low, high): + offset = f"({offset}).to(tl.uint32)" + return f"triton_helpers.randint64({seed}, {offset}, {low}, {high})" + + @staticmethod + def load_seed(name, offset): + raise NotImplementedError("ops.load_seed not implemented outside a kernel") + + @staticmethod + def rsqrt(x): + return f"libdevice.rsqrt({x})" + + @staticmethod + def log1p(x): + return f"libdevice.log1p({x})" + + @staticmethod + def tan(x): + return f"libdevice.tan({x})" + + @staticmethod + def tanh(x): + return f"libdevice.tanh({x})" + + @staticmethod + def sigmoid(x): + return f"tl.sigmoid({x})" + + @staticmethod + def libdevice_sigmoid(x): + return f"1/(1 + libdevice.exp(-({x})))" + + @staticmethod + def signbit(x): + # XX: This is 
wrong for the value -0.0 in floating point + return f"libdevice.signbit({x}) if ({x}).dtype is tl.float32 else {x} < 0" + + @staticmethod + def fmod(a, b): + return f"libdevice.fmod({a}, {b})" + + @staticmethod + def pow(a, b): + return f"libdevice.pow({a}, {b})" + + @staticmethod + def log(x): + return f"tl_math.log({x})" + + @staticmethod + def libdevice_log(x): + return f"libdevice.log({x})" + + @staticmethod + def isinf(x): + return f"libdevice.isinf({x}).to(tl.int1)" + + @staticmethod + def isnan(x): + return f"libdevice.isnan({x}).to(tl.int1)" + + @staticmethod + def round(x): + return f"libdevice.nearbyint({x})" + + @staticmethod + def floor(x): + return f"libdevice.floor({x})" + + @staticmethod + def floordiv(a, b): + # See the comment in lowering.div_mode. a and b are integer type. + # Similar to div_floor_kernel_cuda in pytorch core. + # Notice that // in triton behaves as truncdiv instead of floordiv + quot = f"{a} // {b}" + rem = f"{a} % {b}" + return f"tl.where(({a} < 0) != ({b} < 0), tl.where({rem} != 0, {quot} - 1, {quot}), {quot})" + + @staticmethod + def sign(x): + def to_int(s): + return f"{s}.to(tl.int8)" + + left = to_int(ops.lt("0", x)) + right = to_int(ops.lt(x, "0")) + sub = ops.sub(left, right) + return f"{sub}.to({x}.dtype)" + + @staticmethod + def trunc(x): + return f"libdevice.trunc({x})" + + @staticmethod + def truncdiv(a, b): + # See the comment in lowering.div_mode. a and b are integer type. 
+ # Notice that // in triton behaves as truncdiv instead of floordiv + return f"{a} // {b}" + + @staticmethod + def ceil(x): + return f"libdevice.ceil({x})" + + +TritonOverrides._initialize_pointwise_overrides("triton") + + +# Use mypy to check protocol implemented correctly +def _typecheck_TritonOverrides(h: TritonOverrides) -> OpsHandler[str]: + return h + + +class TritonKernelOverrides(TritonOverrides): + """Map element-wise ops to Triton within a TritonKernel + + Unlike TritonOverrides, these assume the code is going to be inserted into + the body of the main triton kernel and so it may use indexing and mask + variables which are assumed to already be defined in the current scope. + """ + + @classmethod + def constant(cls, value, dtype): + # NOTE: Cannot use shape=[] as it's not supported by triton-rocm + # We could use shape=[1] instead but starting with the correct + # ndim avoids extra `tt.expand_dim` ops appearing in the triton IR. + ndim = V.kernel.triton_tensor_ndim() + shape = [1] * ndim + return cls._shaped_constant(value, dtype, shape=shape) + + @classmethod + def index_expr(cls, expr, dtype): + indexing = V.kernel.indexing(expr, block_ptr=False) + assert isinstance(indexing, IndexingOptions) + # This is called from CSEProxy.__getattr__, so we'll set the bounds there + var = V.kernel.cse.generate(V.kernel.compute, indexing.index_str) + + if dtype not in {torch.int32, torch.int64}: + var = V.kernel.cse.generate(V.kernel.compute, cls.to_dtype(var, dtype)) + var.mask_vars = indexing.mask_vars + return var + + @staticmethod + def masked(mask, body, other): + with V.kernel.mask_loads(mask) as new_mask: + result = body() + + # Take dtype from result to prevent accidental promotion + other = V.kernel.cse.generate( + V.kernel.compute, + f"tl.full({result}.shape, {triton_constant(other)}, {result}.dtype)", + ) + return ops.where(new_mask, result, other) + + @staticmethod + def load_seed(name, offset): + var = V.kernel.args.input(name) + return ( + 
f"tl.load({var} + {V.kernel.args.seed_offset('load_seed_offset', offset)})" + ) + + @staticmethod + def frexp(x): + cache_key = f"frexp({x})" + if cache_key in V.kernel.cse.cache: + return V.kernel.cse.cache[cache_key] + + mantissa = V.kernel.cse.newvar() + exponent = V.kernel.cse.newvar() + V.kernel.compute.writeline( + f"{mantissa}, {exponent} = triton_helpers.frexp({x})" + ) + V.kernel.cse.cache[cache_key] = (mantissa, exponent) + return (mantissa, exponent) + + +# Use mypy to check protocol implemented correctly +def _typecheck_TritonKernelOverrides(h: TritonKernelOverrides) -> OpsHandler[str]: + return h + + +@dataclasses.dataclass +class IterationRanges: + """ + Each range tree represents multiple sets of iteration indexing + in a single tiled dimension in the output kernel. + + If you have two loops ranges one (4, 3, 2) and another (4, 6), + then the range tree will be: + 4 (i0) + 3 (i1) 6 (i3) + 2 (i2) + Where i0 is shared between both loops, but then the split into + different indexing vars. All loop ranges must iterate over + the same number of elements. 
+ """ + + def __init__( + self, + name: str, + var_list: List[sympy.Symbol], + var_ranges: Dict[sympy.Symbol, sympy.Expr], + numel: sympy.Expr, + prefix: str, + *, + kernel: TritonKernel, + divisor=sympy.Integer(1), + length=sympy.Integer(1), + root: IterationRangesRoot, + ): + super().__init__() + self.name = name + self.var_list = var_list + self.var_ranges = var_ranges + self.numel = numel + self.prefix = prefix + self.divisor = divisor + self.length = length + self.kernel = kernel + self.root = root + + def symbol(self): + return sympy_index_symbol(self.name) + + +class IterationRangesRoot(IterationRanges): + def __init__( + self, + name: str, + numel: sympy.Expr, + prefix: str, + index: int, + kernel: TritonKernel, + pid_cache=None, + *, + is_loop: bool, + tensor_dim: Optional[int], + grid_dim: Optional[int], + ): + if pid_cache is None: + pid_cache = {} + super().__init__( + name=name, + var_list=[], + var_ranges={}, + numel=numel, + prefix=prefix, + kernel=kernel, + root=self, + ) + self.index = index + # Store all the nodes in one flat list + self.nodes: Dict[sympy.Expr, IterationRangesEntry] = {} + # This is for re-ordering program ID in triton mm template + # pid_cache["tl.program_id(0)"] = pid_m + self.pid_cache: Dict[str, str] = pid_cache + + # True if the dimension is implemented as a single program looping over + # the full dimension (currently only used for non-persistent reduction) + assert not is_loop or (prefix == "r" and grid_dim is None) + self.is_loop = is_loop + # Index of corresponding dimension on triton tensors + self.tensor_dim = tensor_dim + # Index of corresponding dimension in the triton grid + self.grid_dim = grid_dim + + def __repr__(self): + return f"IterationRangesRoot({self.name!r}, {self.numel}, ...)" + + def cache_clear(self): + for node in self.nodes.values(): + node.cache_clear() + + def lookup(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if 
V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + if expr not in self.nodes: + node = IterationRangesEntry( + f"{self.prefix}{next(V.kernel.iter_vars_count)}", + divisor, + length, + expr, + self, + ) + V.kernel.range_tree_nodes[node.symbol()] = node + self.var_list.append(node.symbol()) + self.var_ranges[node.symbol()] = length + self.nodes[expr] = node + return self.nodes[expr] + + def construct_entries(self, lengths: List[sympy.Expr]): + divisor = sympy.Integer(1) + itervars = [] + for length in reversed(lengths): + itervars.append(self.lookup(divisor, length)) + divisor = divisor * length + return list(reversed(itervars)) + + def construct(self, lengths: List[sympy.Expr]): + return [e.symbol() for e in self.construct_entries(lengths)] + + def vars_and_sizes(self, index: sympy.Expr): + """Figure out vars from this tree used in index""" + nodes = [V.kernel.range_tree_nodes.get(s) for s in index.free_symbols] + nodes = [n for n in nodes if n and n.prefix == self.prefix] + nodes.sort(key=lambda x: V.graph.sizevars.size_hint(x.divisor)) + divisor = sympy.Integer(1) + index_vars = [] + sizes = [] + + def add(node): + nonlocal divisor + index_vars.append(node.symbol()) + sizes.append(node.length) + divisor = divisor * node.length + + for node in nodes: + if not V.graph.sizevars.statically_known_equals(node.divisor, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(node.divisor, divisor))) + divisor = node.divisor + add(node) + if not V.graph.sizevars.statically_known_equals(self.numel, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(self.numel, divisor))) + + return list(reversed(index_vars)), list(reversed(sizes)) + + def ranges_code(self): + assert self.tensor_dim is not None + size = 
self.kernel.indexing_size_str(self.tensor_dim) + index_dtype = self.kernel.index_dtype + convert = f".to({index_dtype})" if index_dtype != "tl.int32" else "" + return f"tl.arange(0, {self.prefix.upper()}BLOCK){size}{convert}" + + def scalar_code(self, value): + index_dtype = self.kernel.index_dtype + ndim = self.kernel.triton_tensor_ndim() + size = [1] * ndim + return f"tl.full({size}, {value}, {index_dtype})" + + def get_pid(self): + assert self.grid_dim is not None + key = f"tl.program_id({self.grid_dim})" + # y_grid has a limit, so express it in terms of y and z in case of overflow. + # z grid is only exercised when max_tiles == 3 (off by default). + if ( + self.grid_dim == 1 + and config.triton.max_tiles <= 2 + and not (isinstance(self.numel, int) and self.numel <= get_max_y_grid()) + ): + key = f"{key} * (tl.program_id({self.grid_dim + 1}) + 1)" + pid = self.pid_cache.get(key, key) + if self.kernel.index_dtype != "tl.int32": + return f"{pid}.to({self.kernel.index_dtype})" + return pid + + def codegen_header(self, code): + x = self.prefix + if self.is_loop: + code.writeline(f"{self.name} = {x}offset + {x}base") + elif self.grid_dim is None: + # no need to "{x}offset = " + code.writeline(f"{self.name} = {self.ranges_code()}") + code.writeline(f"{x}offset = 0") + else: + if self.tensor_dim is not None: + line = f"{x}offset + {self.ranges_code()}" + else: + line = self.scalar_code(f"{x}offset") + code.writelines( + [ + f"{x}offset = {self.get_pid()} * {x.upper()}BLOCK", + f"{self.name} = {line}", + ] + ) + code.writeline(f"{x}mask = {self.name} < {x}numel") + + +class IterationRangesEntry(IterationRanges): + def __init__( + self, + name: str, + divisor: sympy.Expr, + length: sympy.Expr, + expr: sympy.Expr, + parent: IterationRanges, + ): + super().__init__( + name=name, + numel=parent.numel / length, + var_list=parent.var_list, + var_ranges=parent.var_ranges, + prefix=parent.prefix, + divisor=divisor, + length=length, + kernel=parent.kernel, + root=parent.root, + 
) + self.parent = parent + self.codegen = functools.lru_cache(None)(self._codegen) + self.expr = expr + + def __repr__(self): + return f"IterationRangesEntry({self.name}, {self.divisor}, {self.length}, {self.expr}, {self.var_ranges})" + + def set_name(self, name): + self.codegen = lambda: name # type: ignore[assignment] + self.codegen.cache_clear = lambda: None # type: ignore[method-assign] + self.name = name + + def cache_clear(self): + self.codegen.cache_clear() + + def writeline(self, line): + if self.root.is_loop: + V.kernel.indexing_code.writeline(line) + else: + # lift non-reduction stores outside loop + V.kernel.body.writeline(line) + + def _codegen(self): + self.writeline(f"{self.name} = " + texpr(V.kernel.rename_indexing(self.expr))) + return self.name + + def precomputed_args(self): + # for dynamic shapes, find parts of indexing expressions that have to be precomputed + precomputed_args: List[sympy.Expr] = [] + if isinstance(self.expr, sympy.Symbol): + return precomputed_args + assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) + for arg in self.expr.args[1:]: + if not isinstance(arg, (sympy.Integer, sympy.Symbol)): + symbols = arg.free_symbols + if len(symbols) > 0 and all(s.name.startswith("s") for s in symbols): + precomputed_args.append(arg) + return precomputed_args + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return self.name == other.name + + +class HelperFunctions: + """An ordered set of helper functions.""" + + _templates_seen: Dict[str, str] # Template code to function name + finalized_helpers: List[str] + + def __init__(self): + self._templates_seen = {} + self.finalized_helpers = [] + + def add(self, template_code: str) -> str: + """This accepts a function definition with the function name + left as a format specifier e.g. + + @triton.jit + def {name}(arg0, arg1): + return arg0 + arg1 + + We add the templated code to the function set and return the name + assigned to that function. 
+ + """ + existing_name = self._templates_seen.get(template_code) + if existing_name is not None: + # Don't duplicate existing helpers + return existing_name + + name = f"_triton_helper_fn{len(self.finalized_helpers)}" + self._templates_seen[template_code] = name + self.finalized_helpers.append(template_code.format(name=name)) + return name + + def __iter__(self): + return iter(self.finalized_helpers) + + def __getitem__(self, idx): + return self.finalized_helpers[idx] + + +class TritonKernel(Kernel): + overrides = TritonKernelOverrides # type: ignore[assignment] + sexpr = pexpr + + helper_functions: HelperFunctions + + def __init__( + self, + *groups, + index_dtype: str, + mutations: Optional[Set[str]] = None, + pid_cache=None, + reduction_hint=ReductionHint.DEFAULT, + min_elem_per_thread=0, + disable_persistent_reduction=False, + ): + if pid_cache is None: + pid_cache = {} + super().__init__() + self.numels = [V.graph.sizevars.simplify(s) for s in groups] + self.mutations: Set[str] = mutations if mutations is not None else set() + self.range_trees: List[IterationRangesRoot] = [] + self.range_tree_nodes: Dict[sympy.Symbol, IterationRangesEntry] = {} + self.iter_vars_count = itertools.count() + self.inside_reduction = self.numels[-1] != 1 + self.body = IndentedBuffer() + self.indexing_code = IndentedBuffer() + self.suffix: IndentedBuffer = IndentedBuffer() # type: ignore[assignment] + self.outside_loop_vars: Set[Any] = set() + self.reduction_hint = reduction_hint + self.index_dtype: str = index_dtype + self.min_elem_per_thread = min_elem_per_thread + self.last_usage: Set[str] = set() + self.block_ptr_id = itertools.count() + # buffer accesses in the kernel + self.buf_accesses: DefaultDict[str, List[Dep]] = collections.defaultdict(list) + + self.persistent_reduction: bool = ( + not disable_persistent_reduction + ) and self.should_use_persistent_reduction() + self.no_x_dim = ( + self.reduction_hint == ReductionHint.INNER + and self.persistent_reduction + and 
len(self.numels) == 2 + and self.numels[-1] >= 256 + ) + self.initialize_range_tree(pid_cache) + + self.helper_functions = HelperFunctions() + + # A set of autotuning hints to pass as part of triton_meta + self.autotune_hints: Set[AutotuneHint] = set() + + # define this in a closure to make cache local to object + @functools.lru_cache(None) + def simplify_indexing(index: sympy.Expr): + index = V.graph.sizevars.simplify_with_ranges(index, self.var_ranges()) + for tree in self.range_trees: + index = self.combine_contiguous_dims(index, tree) + return index + + self.simplify_indexing = simplify_indexing + self.code_hash = None + self.triton_meta: Optional[Dict[str, object]] = None + + def need_numel_args(self): + r""" + Indicate whether we need provide numel as arguments for the generated + kernel calls in the benchmark. + + Should be true for pointwise/reduction kernels but false for triton + matmul kernels. + """ + return True + + def should_use_persistent_reduction(self) -> bool: + """ + Heuristic to set self.persistent_reduction and add guards + if needed. + """ + if not (self.inside_reduction and config.triton.persistent_reductions): + return False + threshold = { + ReductionHint.INNER: 1024, + }.get(self.reduction_hint, 64) + + # If multi_kernel is enabled, we do more aggressive persistent reduction. + # This may result in some persisent reductions slower than the + # corresponding non-persistent reductions. MultiKernel will do benchmarking + # to pick the faster one. 
+ if config.triton.multi_kernel: + threshold *= 16 + last_numel = self.numels[-1] + if not isinstance(last_numel, (int, sympy.Integer)): + # Not static + return False + hint = V.graph.sizevars.size_hint(last_numel) + if hint > threshold: + return False + # will need to recompile if we cross a larger power of 2 boundary + V.graph.sizevars.guard_leq(self.numels[-1], next_power_of_2(hint)) # type: ignore[arg-type] + return True + + def set_last_usage(self, nodes): + if not self.inside_reduction or self.persistent_reduction: + return + self.last_usage = set( + itertools.chain.from_iterable( + n.last_usage for n in nodes if n is not EnableReduction + ) + ) + + def initialize_range_tree(self, pid_cache): + no_r_dim = not self.inside_reduction or self.numels[-1] == 1 + + prefixes = "zyxr" + active_prefixes = prefixes[-len(self.numels) :] + + grid_dims = "xyz" + if self.no_x_dim: + tensor_dims = "r" + elif no_r_dim: + tensor_dims = "xyz" + else: + tensor_dims = "xyzr" + + tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) + + for i, prefix in enumerate(active_prefixes): + is_reduction = prefix == "r" + tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None + grid_dim = None if is_reduction else grid_dims.find(prefix) + index = i if grid_dim is None else grid_dim + self.range_trees.append( + IterationRangesRoot( + f"{prefix}index", + self.numels[i], + prefix, + index, + self, + pid_cache=pid_cache, + is_loop=is_reduction and not self.persistent_reduction, + tensor_dim=tensor_dim, + grid_dim=grid_dim, + ) + ) + for tree in self.range_trees: + # reduction indexing goes inside a loop + if not tree.is_loop: + tree.codegen_header(self.body) + if self.inside_reduction and self.range_trees[-1].is_loop: + # workaround for this issue: + # https://gist.github.com/jansel/6527126f781559095c5531f98a4235a7 + self.body.writeline(f"rbase = {self.range_trees[-1].ranges_code()}") + + def disable_reduction(self): + should_flush = 
self.range_trees[-1].is_loop + + @contextlib.contextmanager + def ctx(): + if self.numels[-1] == 1: + assert not self.inside_reduction + yield + return + if should_flush: + # calling codegen_body() will flush all the pending buffers + # and write out a reduction loop + self.codegen_body() + self.inside_reduction = False + try: + yield + if should_flush: + # flush out any code before opening the next loop + self.codegen_body() + finally: + self.inside_reduction = True + + return ctx() + + def set_ranges(self, *lengths): + assert len(lengths) == len(self.range_trees) + return [ + ranges.construct(length) + for length, ranges in zip(lengths, self.range_trees) + ] + + @staticmethod + def _split_iteration_ranges( + groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + sv = V.graph.sizevars + new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] + remaining = [sv.simplify(g) for g in groups] + var_count = itertools.count() + + def add_range(i, expr): + expr = sv.simplify(expr) + if not sv.statically_known_multiple_of(remaining[i], expr): + raise CantSplit() + # guard on the last item out + remaining[i] = FloorDiv(remaining[i], expr) + new_ranges[i].append(expr) + return next(var_count) + + def make_combined(size, idx1, idx2): + def getter(flat_vars): + return size * flat_vars[idx1] + flat_vars[idx2] + + return getter + + return_getters_groups = [] + current_group = 0 + for length_group in lengths: + return_getters = [] + for size in length_group: + if sv.statically_known_equals(size, 1): # type: ignore[arg-type] + return_getters.append(lambda _: sympy.Integer(0)) + continue + + while ( + current_group < len(remaining) + and sv.size_hint(remaining[current_group]) == 1 + ): + # scroll to next group with remaining elements + current_group += 1 + + if sv.size_hint(size) > sv.size_hint(remaining[current_group]): + # need to break size in two + if not sv.statically_known_multiple_of( + size, remaining[current_group] + ): + raise CantSplit() + size1 = 
remaining[current_group] + size2 = FloorDiv(size, remaining[current_group]) + return_getters.append( + make_combined( + size2, + add_range(current_group, size1), + add_range(current_group + 1, size2), + ) + ) + else: + return_getters.append( + operator.itemgetter(add_range(current_group, size)) + ) + return_getters_groups.append(return_getters) + + assert all( + V.graph.sizevars.size_hint(s) == 1 for s in remaining + ), f"failed to set ranges {remaining} {lengths}" + + return new_ranges, return_getters_groups + + @classmethod + def is_compatible( + cls, groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + try: + cls._split_iteration_ranges(groups, lengths) + return True + except CantSplit: + return False + + def split_and_set_ranges(self, lengths: List[List[sympy.Expr]]): + """ + We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1). + + To do this we need to split up the iteration space of i0 into something like: + for i1 in s0: + for i2 in s1: + i0 = i1*s1 + i2 + .... + + This function matches and resplits lengths to the groups of + this kernel to enable tiled + non-tiled fusions. + """ + groups = [rt.numel for rt in self.range_trees] + if not self.inside_reduction: + groups[-1] = sympy.Integer(1) + + if len(lengths) == len(self.range_trees) and all( + V.graph.sizevars.simplify(sympy_product(x) - g) == 0 + for x, g in zip(lengths, groups) + ): + return self.set_ranges(*lengths) + + new_ranges, return_getters_groups = self._split_iteration_ranges( + groups, lengths + ) + itervars = list(itertools.chain.from_iterable(self.set_ranges(*new_ranges))) + return [[fn(itervars) for fn in fns] for fns in return_getters_groups] + + def is_indirect_indexing(self, index: sympy.Expr): + # tmpX means indirect indexing + return free_symbol_startswith(index, "tmp") + + def is_broadcasted(self, index: sympy.Expr): + # Note. 
This may not be correct when there is indirect indexing + if self.is_indirect_indexing(index): + return False + + index_numels = [1] * len(self.numels) + for symbol in index.free_symbols: + if symbol not in self.range_tree_nodes: + # Non-iterated variables, e.g. strides + continue + entry = self.range_tree_nodes[symbol] # type: ignore[index] + assert isinstance(entry.parent, IterationRangesRoot) + index_numels[entry.parent.index] *= entry.length + + # If the index variables only iterate over a subset of the kernel + # numels, then it must be broadcasted. + simplify = V.graph.sizevars.simplify + return any( + simplify(idx_range) != simplify(iter_range) # type: ignore[arg-type] + for idx_range, iter_range in zip(index_numels, self.numels) + ) + + def combine_contiguous_dims(self, index: sympy.Expr, tree: IterationRangesRoot): + """ + More aggressive simplification to merge contiguous dims + """ + if isinstance(index, (sympy.Integer, sympy.Symbol)): + return index + index_vars, sizes = tree.vars_and_sizes(index) + if len(sizes) <= 1: + return index + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, sizes, index_prevent_reordering([index], index_vars, sizes) + ) + if new_sizes == sizes: + return index + new_index_vars = tree.construct(new_sizes) + new_index = sympy_subs(index, dict(zip(index_vars, reindex(new_index_vars)))) + return new_index + + def index_to_str(self, index: sympy.Expr) -> str: + """ + Convert an index expr to a string that can be used in triton code. + e.g. a sympy expression "s2" may actually appear as "ks1" in the triton kernel. + + Index expressions often need to be passed in as arguments to the triton kernel. + Rename_indexing and codegen_indexing keep track of the needed indices and add + new parameters to the function signature. 
+ """ + if isinstance(index, list): + return f"[{', '.join(map(self.index_to_str, index))}]" + return texpr(self.rename_indexing(self.codegen_indexing(index))) + + def indexing( + self, + index: sympy.Expr, + *, + copy_shape=None, + dense_indexing=False, + override_mask=None, + block_ptr=False, + ) -> Union[IndexingOptions, BlockPtrOptions]: + """ + Compute the index and mask to pass to tl.load() or tl.store() + """ + index = self.simplify_indexing(index) + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + # last resort, if no range vars are in the expr, hoist it + # TODO instead of trying to blindly find complicated exprs, we should hoist the + # inputs/outputs sizes and strides, but at the time indexing is generated + # kernel inputs and outputs are not set yet, we'd need a deeper refactor + # to do it this way + + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) 
+ # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + s.name.startswith("s") or s.name.startswith("ps") for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + index = self.simplify_indexing(index) + index_vars = index.free_symbols + has_rindex = False + + mask_vars: Set[str] = set() + for var in index_vars: + assert isinstance(var, sympy.Symbol) + has_rindex = has_rindex or var.name.startswith("r") + if override_mask: + pass + elif var.name.startswith("tmp"): + # indirect indexing + cse_var = self.cse.varname_map[var.name] + mask_vars.update(cse_var.mask_vars) + elif var.name.startswith(("s", "ps", "i", "u")): + pass + else: + # var is one of xN, yN or rN + assert var.name[0] in "xyr", var.name + mask_vars.add(f"{var.name[0]}mask") + + need_dense = ( + config.triton.dense_indexing + or dense_indexing + or self._load_mask is not None + ) and index != 0 + + have_dense = True + have_loop_vars = False + dense_mask_vars = set() + + for tree in self.active_range_trees(): + if index_vars.intersection(tree.var_list): + have_loop_vars = True + else: + have_dense = False + dense_mask_vars.add(f"{tree.prefix}mask") + + if ( + block_ptr + and config.triton.use_block_ptr + and not override_mask + and not self._load_mask + and len(mask_vars - dense_mask_vars) == 0 + and not self.is_indirect_indexing(index) + and have_loop_vars + # workaround https://github.com/openai/triton/issues/2821 + and self.index_dtype == "tl.int32" + ): + index_relative_to_xyr_index = sympy_subs( + index, {v: t.expr for v, t in self.range_tree_nodes.items()} + ) + range_trees = self.active_range_trees(reorder=True) + symbols = [t.symbol() for t in range_trees] + strides = [sympy.Wild(f"stride_{s}", exclude=symbols) for s in symbols] + offset = sympy.Wild("_offset", exclude=symbols) + m = index_relative_to_xyr_index.match(sympy_dot(symbols, 
strides) + offset) + # TODO(jansel): it is sometimes possible to do higher dimensional block_ptrs with + # a tl.reshape the correct block. We will miss these cases today. + if m: + self.filter_masks(mask_vars) + return BlockPtrOptions.create( + [m[s] for s in strides], + m[offset], + range_trees, + mask_vars, # type: ignore[arg-type] + ) + + expand_str = None + index_str = self.index_to_str(index) + if isinstance(index, sympy.Integer): + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" + return IndexingOptions(index_str, set(), "None", expand_str, has_rindex) + + if need_dense and not have_dense: + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.broadcast_to({index_str}, {expand_str})" + mask_vars = dense_mask_vars + elif not have_loop_vars and copy_shape: + index_str = f"tl.broadcast_to({index_str}, {copy_shape}.shape)" + mask_vars = dense_mask_vars + + if override_mask: + mask_vars = {override_mask} + + if self._load_mask: + mask_vars.add(self._load_mask) + + self.filter_masks(mask_vars) + + mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None" + return IndexingOptions(index_str, mask_vars, mask_str, expand_str, has_rindex) # type: ignore[arg-type] + + def active_range_trees(self, reorder=False): + trees = [ + t for t in self.range_trees if t.prefix != "r" or self.inside_reduction + ] + if reorder and len(trees) > 1: + count = sum(t.prefix in "xyz" for t in trees) + assert "".join(t.prefix for t in trees[:count]) == "zyx"[-count:], [ + t.prefix for t in trees[:count] + ] + trees[:count] = reversed(trees[:count]) + return trees + + def filter_masks(self, mask_vars): + for tree in self.range_trees: + # Masks are superfluous if we only have one element + if V.graph.sizevars.statically_known_equals(tree.numel, 1): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + continue + # Masks are 
superfluous if numel is a multiple of BLOCK + # (We use the fact that BLOCK is required by triton to be a power of 2) + if tree.prefix.upper() not in config.triton.max_block: + continue + max_block = config.triton.max_block[tree.prefix.upper()] + # Optional optimization: if block divides numel exactly, we will + # never need to do a masked load to handle stragglers at the end. + # It's faster to avoid masking at all. But it is sound to always + # mask. + if V.graph.sizevars.statically_known_multiple_of(tree.numel, max_block): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + + def var_ranges(self): + return dict( + itertools.chain.from_iterable( + tree.var_ranges.items() for tree in self.range_trees + ) + ) + + def codegen_indexing(self, expr: sympy.Expr): + expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) + for sym in sorted(expr.free_symbols, key=str): + if sym in self.range_tree_nodes: + # if indexing expression is complicated, we precompute it on the host side + # and send the result as a kernel argument + replacements = {} + for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] + replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) + if len(replacements) > 0: + self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] + self.range_tree_nodes[sym].expr, replacements # type: ignore[index] + ) + self.range_tree_nodes[sym].codegen() # type: ignore[index] + return expr + + @contextlib.contextmanager + def mask_loads(self, mask): + """Context manager to add an additional mask to tl.load/store""" + prior = self._load_mask + if prior: + mask = self.cse.generate(self.compute, f"{mask} & {prior}") + + self._load_mask = mask + try: + # TODO(jansel): do we need a reshape here? 
+ yield mask + finally: + self._load_mask = prior + + def generate_assert(self, check): + return torch.version.hip is None and super().generate_assert(check) + + def load_mask(self, var): + mask = "" + mask_vars = set(var.mask_vars) + if self._load_mask: + mask_vars.add(self._load_mask) + + if mask_vars: + mask = ( + f"{next(iter(mask_vars))}" + if len(mask_vars) == 1 + else f"({' & '.join(str(v) for v in mask_vars)})" + ) + return mask + + @property + def assert_function(self) -> str: + return "tl.device_assert" + + def get_strides_of_load(self, index: sympy.Expr): + """ + This gets the stride of the index for each of the tiling variables + (technically, it does it at index 0) + + For example, if + xindex = x0 + 512*x1 + 1024*r0 + x0 = (xindex//512) + x1 = (xindex % 512) + r0 = rindex // 1024 + + this function would return + {xindex: 512, rindex: 1024} + """ + index_to_tile_indexes = {k: v.expr for k, v in self.range_tree_nodes.items()} + index_in_tile_vars = sympy_subs(index, index_to_tile_indexes) # type: ignore[arg-type] + strides = {} + for range_tree in self.range_trees: + s = sympy_index_symbol(range_tree.name) + strides[s] = sympy_subs(index_in_tile_vars, {s: 1}) - sympy_subs( + index_in_tile_vars, {s: 0} + ) + return strides + + def codegen_block_ptr( + self, name: str, var: str, indexing: BlockPtrOptions, other="" + ) -> Tuple[str, Optional[DeferredLine], str]: + advance_block_ptr = None + check = indexing.boundary_check() + if not check: + # workaround https://github.com/openai/triton/issues/2813 + other = "" + elif other: + assert other == ", other=0.0" + other = f", boundary_check={check!r}, padding_option='zero'" + else: + other = f", boundary_check={check!r}" + if ( + self.inside_reduction + and self.range_trees[-1].is_loop + and indexing.has_rindex() + ): + block_ptr = f"block_ptr{next(self.block_ptr_id)}" + self.body.writeline( + DeferredLine( + name, f"{block_ptr} = {indexing.format(var, roffset=False)}" + ) + ) + advance_block_ptr = DeferredLine( 
+ name, + f"{block_ptr} = tl.advance({block_ptr}, {indexing.advance_roffset()})", + ) + else: + block_ptr = indexing.format(var) + return block_ptr, advance_block_ptr, other + + def codegen_block_ptr_store_line(self, name, indexing, block_ptr, value, other=""): + # broadcasting is not implicit for block_ptrs + value = ( + f"tl.broadcast_to({value}, {self.index_to_str(indexing.reshape_suffix)})" + ) + # drop any extra size=1 dimensions + value = triton_reshape(value, indexing.reshape_suffix, indexing.block_shape) + # workaround https://github.com/openai/triton/issues/2814 + value = f"{value}.to({triton_store_type(V.graph.get_dtype(name))})" + return f"tl.store({block_ptr}, {value}{other})" + + def load(self, name: str, index: sympy.Expr): + var = self.args.input(name) + indirect_indexing = self.is_indirect_indexing(index) + original_index = index + indexing = self.indexing(index, block_ptr=True) + has_rindex = indexing.has_rindex() + has_tmpmask = indexing.has_tmpmask() + + # Keep the variable in cache if were going to reuse it. Equiv., if any of the following hold + # 1) We are doing broadcasting + # 2) It is a non-coalesced load. The intuition is that if it's + # non-coalesced, we will likely load each element multiple times in + # practice. + # 3) It will be used later and it won't be CSE'd. 
Equiv., if all the following hold + # 3.1) We are in a reduction loop + # 3.2) Its not its last use + # 3.3) This load will not be lifted to the body + # + is_coalesced = any( + i == 1 for i in self.get_strides_of_load(original_index).values() + ) + if self.is_broadcasted(original_index): + ep = ", eviction_policy='evict_last'" + elif not is_coalesced: + ep = ", eviction_policy='evict_last'" + elif self.inside_reduction and self.range_trees[-1].is_loop: + if name in self.args.inplace_buffers: + names = set(self.args.inplace_buffers[name].other_names) + else: + names = {name} + last_use = len(names & self.last_usage) > 0 + evict_last = not last_use and (has_rindex or indirect_indexing) + if evict_last: + ep = ", eviction_policy='evict_last'" + else: + ep = ", eviction_policy='evict_first'" + else: + ep = "" + # "other" below is a workaround for https://github.com/openai/triton/issues/737 + # for bool, even though it's likely subject to the same bug, setting `other` leads + # to LLVM errors so we are skipping it for now + if ( + (has_tmpmask or has_rindex) + and V.graph.get_dtype(name) != torch.bool + and indexing.has_mask() + ): + other = ", other=0.0" + else: + other = "" + + advance_block_ptr = None + append_broadcast = None + if V.graph.is_unspec_arg(name): + line = var + else: + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing, other + ) + line = f"tl.load({block_ptr}{other}{ep})" + # add needed size=1 dimensions + line = triton_reshape( + line, indexing.block_shape, indexing.reshape_suffix + ) + elif isinstance(original_index, sympy.Integer): + line = f"tl.load({var} + ({original_index}))" + append_broadcast = indexing.expand_str + else: + line = f"tl.load({var} + ({indexing.index_str}), {indexing.mask_str}{ep}{other})" + + dtype = V.graph.get_dtype(name) + if dtype in (torch.float16, torch.bfloat16): + line += ".to(tl.float32)" + if dtype == torch.bool and torch.version.hip is None: 
+ # Workaround for https://github.com/openai/triton/issues/2151 + # tl.load returns int8 when loading from pointer to int1 + # NOTE: Currently causes hangs on bool UTs for ROCm + line += ".to(tl.int1)" + + if has_tmpmask: + # Masked loads must come after the mask is computed + load_buffer = self.compute + elif ( + self.inside_reduction + and self.range_trees[-1].is_loop + and not indirect_indexing + and not has_rindex + ): + # can lift a common load outside of reduction loop + # One exception is when this is an indirect_load. + load_buffer = self.body + else: + load_buffer = self.loads + + result_var = self.cse.generate(load_buffer, line) + assert isinstance(result_var, TritonCSEVariable) + result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] + + if append_broadcast: + line = f"tl.broadcast_to({result_var}, {append_broadcast})" + result_var = self.cse.generate(load_buffer, line) + + if advance_block_ptr: + load_buffer.writeline(advance_block_ptr) + + if not self.inside_reduction or (not indexing.has_rmask() and not has_rindex): + self.outside_loop_vars.add(result_var) + + return result_var + + def store( + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + var = self.args.output(name) + original_index = index + indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None) + + # Guard against write-after-read corruption in triton. + # See # https://github.com/openai/triton/issues/1615 + # This triton bug means that a load which is broadcasted over multiple + # warps may see the result of a store that happens later in the triton + # program. The workaround is to add a barrier before storing, which + # enforces that all warps have already read the data. 
+ is_inplace = name in self.args.inplace_buffers + is_broadcasted = self.is_broadcasted(original_index) + if is_inplace and is_broadcasted: + self.stores.writeline(DeferredLine(name, "tl.debug_barrier()")) + + advance_block_ptr = None + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing + ) + # block_ptr stores don't do implicit casting + line = self.codegen_block_ptr_store_line( + name, indexing, block_ptr, value, other + ) + elif mode is None: + line = f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})" + elif mode == "atomic_add": + line = f"tl.atomic_add({var} + ({indexing.index_str}), {value}, {indexing.mask_str})" + else: + raise NotImplementedError(f"store mode={mode}") + self.stores.writeline(DeferredLine(name, line)) + if advance_block_ptr: + self.stores.writeline(advance_block_ptr) + + if not self.inside_reduction: + self.outside_loop_vars.add(value) + + def bucketize( + self, + values: CSEVariable, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ) -> CSEVariable: + """ + See [Note: Inductor bucketize op] + """ + + # Triton performance for bucketize_binary_search is much better when the number + # of threads equals the number of elements. + # If we're trying to use a bucketize kernel, we should make sure that an + # autotuning config with num_elements_per_warp=32 exists. 
+ self.autotune_hints.add(AutotuneHint.ELEMENTS_PER_WARP_32) + + offsets_ptr = self.args.input(offsets_name) + block_size = self.dense_size_str() + offsets_size_str = self.index_to_str(offsets_size) + + if indexing_dtype == torch.int32: + triton_dtype = "tl.int32" + elif indexing_dtype == torch.int64: + triton_dtype = "tl.int64" + else: + raise NotImplementedError( + "Bucketize only supports indexing with int32 and int64" + ) + + result = self.cse.generate( + self.compute, + f"triton_helpers.bucketize_binary_search({values}, {offsets_ptr}, {triton_dtype}, {right}, {offsets_size_str}, {block_size})", # noqa: B950 line too long + ) + + return result + + def reduction_resize(self, value): + ndims = self.triton_tensor_ndim() + if ndims == 1: + return f"triton_helpers.promote_to_tensor({value})" + + sizes = [":"] * ndims + sizes[-1] = "None" + return f"{value}[{', '.join(sizes)}]" + + @staticmethod + def _map_tuple_or_scalar(fn, value): + if isinstance(value, tuple): + return tuple(map(fn, value)) + return fn(value) + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + assert self.inside_reduction + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + # Say we have + # tmp0 = ops.constant(1, torch.int64) + # tmp1 = ops.reduction(torch.int64, torch.int64, "sum", tmp0) + # tmp0 in the triton code is either a scalar, or single-element tensor + # so if we emit tl.sum directly, it will only give 1 instead of RBLOCK * 1 + # To avoid this, we broadcast to the expected shape first. 
+ dense_size_str = self.dense_size_str() + value = self._map_tuple_or_scalar( + lambda v: self.cse.generate( + self.compute, f"tl.broadcast_to({v}, {dense_size_str})" + ), + value, + ) + + dim: int + root_op: str + + def final_reduction(value): + use_helper = reduction_type in {"any", "max", "min", "prod"} + module = "triton_helpers" if use_helper else "tl" + if reduction_type in {"max", "min"}: + return self.reduction_resize( + f"{module}.{reduction_type}2({value}, {dim})" + ) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") + + def final_argreduce(buffer, result_var, value, index): + buffer.splice( + f"""\ + _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) + {result_var} = {self.reduction_resize(f'{result_var}_tmp')} + """ + ) + + cache_key = (src_dtype, reduction_type, value) + if cache_key in self.cse.reduction_cache: + return self.cse.reduction_cache[cache_key] + + dim = self.triton_tensor_ndim() - 1 + acc_type = triton_acc_type(src_dtype) + result_var: Any = self.cse.newvar() + result_var.mask_vars = {var for var in masks if var[0] != "r"} + cond = " & ".join(masks) + + def where_cond(tval, fval): + if not cond: + return tval + return TritonKernelOverrides.where(cond, tval, fval) + + if self.persistent_reduction: + default = ir.Reduction.default_value(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(triton_constant, default) + + def _mask_value(value, default): + return self.cse.generate(self.compute, where_cond(value, default)) + + if isinstance(value, tuple): + masked_value = [_mask_value(v, d) for v, d in zip(value, default)] + else: + masked_value = _mask_value(value, default) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = str( + self.cse.generate( + self.compute, + f"tl.broadcast_to({reduction_range_prefix}index, {masked_value}.shape)", + ) + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + final_argreduce( + self.compute, result_var, 
masked_value, accumulator_index + ) + elif reduction_type == "welford_reduce": + # For persistent reductions, don't bother with + # welford's algorithm since it uses more registers, and + # taking two reductions doesn't increase memory usage. + sum_ = ops.reduction(dtype, dtype, "sum", value) + self.inside_reduction = False + rnumel = ops.index_expr(self.numels[-1], dtype) + mean = ops.truediv(sum_, rnumel) + + self.inside_reduction = True + dx = ops.sub(value, mean) + dx2 = ops.mul(dx, dx) + m2 = ops.reduction(dtype, dtype, "sum", dx2) + result_var = (mean, m2, rnumel) + elif reduction_type == "welford_combine": + mean, m2, weight = masked_value + welford = f"triton_helpers.welford({mean}, {m2}, {weight}, {dim})" + mean, m2, weight = (self.cse.newvar() for _ in range(3)) + self.compute.writeline(f"{mean}, {m2}, {weight} = {welford}") + + result_var = tuple( + self.cse.generate(self.compute, self.reduction_resize(var_name)) + for var_name in (mean, m2, weight) + ) + else: + result_var = self.cse.generate( + self.compute, final_reduction(masked_value) + ) + else: + accumulator = f"_{result_var}" + default = ir.Reduction.default_accumulator(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(triton_constant, default) + if not isinstance(default, tuple): + self.body.writeline( + f"{accumulator} = tl.full({self.dense_size_str()}, {default}, {acc_type})" + ) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = f"_{result_var}_index" + long_max = torch.iinfo(torch.int64).max + self.body.writeline( + f"{accumulator_index} = tl.full({self.dense_size_str()}, {long_max}, tl.int64)" + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_index}_next = triton_helpers.{root_op}imum_with_index( + {accumulator}, {accumulator_index}, {value}, {reduction_range_prefix}index + ) + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_index} = 
{where_cond(f'{accumulator_index}_next', accumulator_index)} + """ + ) + final_argreduce(self.suffix, result_var, accumulator, accumulator_index) + elif is_welford_reduction(reduction_type): + accumulator = f"{result_var}_mean" + accumulator_m2 = f"{result_var}_m2" + accumulator_weight = f"{result_var}_weight" + self.body.writeline( + f"{accumulator} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + self.body.writeline( + f"{accumulator_m2} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + self.body.writeline( + f"{accumulator_weight} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + + if reduction_type == "welford_combine": + mean, m2, weight = value + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_m2}_next, {accumulator_weight}_next = triton_helpers.welford_combine( + {accumulator}, {accumulator_m2}, {accumulator_weight}, + {mean}, {m2}, {weight} + ) + """ + ) + else: + assert reduction_type == "welford_reduce" + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_m2}_next, {accumulator_weight}_next = triton_helpers.welford_reduce( + {value}, {accumulator}, {accumulator_m2}, {accumulator_weight}, roffset == 0 + ) + """ + ) + + self.compute.splice( + f"""\ + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_m2} = {where_cond(f'{accumulator_m2}_next', accumulator_m2)} + {accumulator_weight} = {where_cond(f'{accumulator_weight}_next', accumulator_weight)} + """ + ) + + result_mean = result_var + result_m2 = self.cse.newvar() + result_weight = self.cse.newvar() + self.suffix.splice( + f"""\ + {result_mean}_tmp, {result_m2}_tmp, {result_weight}_tmp = triton_helpers.welford( + {accumulator}, {accumulator_m2}, {accumulator_weight}, {dim} + ) + {result_mean} = {self.reduction_resize(f'{result_mean}_tmp')} + {result_m2} = {self.reduction_resize(f'{result_m2}_tmp')} + {result_weight} = {self.reduction_resize(f'{result_weight}_tmp')} + """ + ) + result_var = result_mean, result_m2, result_weight + 
else: + combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) + updated = combine_fn(accumulator, value) + self.compute.writeline( + f"{accumulator} = {where_cond(updated, accumulator)}" + ) + + if src_dtype == torch.bool: + # This is only really used for aten.any. It changes the + # final reduction of a non-persistent reduction from + # tmp5 = triton_helpers.max(_tmp5, 1)[:, None] + # to + # tmp5 = triton_helpers.max(_tmp5.to(tl.int8), 1)[:, None].to(tl.int1) + # which is needed because tl.reduce doesn't support tl.int1 + accumulator = f"{accumulator}.to(tl.int8)" + result_type = triton_compute_type(dtype) + self.suffix.writeline( + f"{result_var} = {final_reduction(accumulator)}.to({result_type})" + ) + else: + self.suffix.writeline( + f"{result_var} = {final_reduction(accumulator)}" + ) + + self.cse.reduction_cache[cache_key] = result_var + + if isinstance(result_var, tuple): + self.outside_loop_vars |= set(result_var) + else: + self.outside_loop_vars.add(result_var) + + return result_var + + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + assert self.inside_reduction + self.inside_reduction = False + indexing = self.indexing(index, block_ptr=True) + self.inside_reduction = True + var = self.args.output(name) + + if isinstance(indexing, BlockPtrOptions): + self.suffix.writeline( + DeferredLine( + name, + self.codegen_block_ptr_store_line( + name, + indexing, + indexing.format(var), + value, + f", boundary_check={indexing.boundary_check()!r}", + ), + ) + ) + else: + assert isinstance(indexing, IndexingOptions) + self.suffix.writeline( + DeferredLine( + name, + f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})", + ) + ) + + def _lift_helper(self, fn, num_args) -> str: + # Lift IR function into a triton function in the global namespace + helper = IndentedBuffer() + helper.writeline("@triton.jit") + args = [f"arg{n}" for n in range(num_args)] + signature = ", ".join(args) + helper.writeline(f"def 
{{name}}({signature}):") + + cse = CSE(prefix="", suffix="") + overrides = TritonOverrides(V.MockHandler()) + + class CSEProxy: + def __getattr__(self, name: str) -> Callable[..., CSEVariable]: + def inner(*args, **kwargs): + return cse.generate( + helper, + getattr(overrides, name)(*args, **kwargs), + ) + + return inner + + with helper.indent(), V.set_ops_handler(CSEProxy()): + outputs = fn(*args) + helper.writeline(f"return {outputs}") + + return self.helper_functions.add(helper.getvalue()) + + def scan( + self, + dtype: torch.dtype, + combine_fn: Callable[[CSEVariable, CSEVariable], CSEVariable], + value: CSEVariable, + init: int, + ) -> CSEVariable: + assert self.inside_reduction + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + value = self.cse.generate( + self.compute, f"tl.broadcast_to({value}, {self.dense_size_str()})" + ) + + default = triton_constant(init) + dim = self.triton_tensor_ndim() - 1 + acc_type = triton_acc_type(dtype) + cond = " & ".join(masks) + + combine_helper_fn = self._lift_helper(combine_fn, 2) + + def where_cond(value): + if not cond: + return value + default_tensor = self.cse.generate( + self.body, + f"tl.full({[1] * self.triton_tensor_ndim()}, {default}, {triton_compute_type(dtype)})", + ) + return self.cse.generate( + self.compute, f"tl.where({cond}, {value}, {default_tensor})" + ) + + if self.persistent_reduction: + masked_value = where_cond(value) + result_var = self.cse.generate( + self.compute, + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})", + ) + else: + accumulator = self.cse.newvar() + reduced_size = self.dense_size_list() + reduced_size[-1] = "1" + reduced_size = f"[{', '.join(reduced_size)}]" + + self.body.writeline( + f"{accumulator} = tl.full({reduced_size}, {default}, {acc_type})" + ) + + masked_value = where_cond(value) + 
partial_reduce = self.cse.generate( + self.compute, + self.reduction_resize( + f"tl.reduce({value}, {dim}, {combine_helper_fn})" + ), + ) + acc_next = combine_fn(accumulator, partial_reduce) + partial_scan = self.cse.generate( + self.compute, + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})", + ) + result_var = self.cse.generate( + self.compute, combine_fn(accumulator, partial_scan) + ) + self.compute.writeline(f"{accumulator} = {acc_next}") + + result_var.mask_vars = masks # type: ignore[attr-defined] + return result_var + + def codegen_body(self): + """ + Concat output code from index_code, loads, compute, stores, + suffix into self.body. + + For pointwise kernels, this is called just once at the end. + + For reduction kernels, this generates a loop over the reduction + axis. + """ + if not ( + self.indexing_code + or self.loads + or self.stores + or self.compute + or self.suffix + ): + return + + if self.inside_reduction and self.range_trees[-1].is_loop: + self.body.writeline("for roffset in range(0, rnumel, RBLOCK):") + with self.body.indent(): + # last range tree is always reduction + self.range_trees[-1].codegen_header(self.body) + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + + # invalidate any caches that came from inside the reduction loop + self.cse.invalidate(self.outside_loop_vars) + self.range_trees[-1].cache_clear() + else: + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + self.body.splice(self.suffix) + self.indexing_code.clear() + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.suffix.clear() + + def codegen_kernel_benchmark(self, num_gb, grid=None): + result = IndentedBuffer() + argdefs, call_args, signature = self.args.python_argdefs() + + result.writelines(["", "", "def get_args():"]) + with result.indent(): + name_cnt = 
itertools.count() + var_names = [] + for arg_name, arg_sig in zip(call_args, signature): + var_name = f"arg_{next(name_cnt)}" + buf = V.graph.get_buffer(arg_name) + if buf: + result.writeline( + f"{var_name} = rand_strided({V.graph.sizevars.size_hints(buf.get_size())}, {V.graph.sizevars.size_hints(buf.get_stride())}, device='{buf.get_device()}', dtype={buf.get_dtype()})" # noqa: B950 line too long + ) + elif arg_name in V.graph.constants: + # note that random seed is put in V.graph.constants + const_tensor = V.graph.constants[arg_name] + result.writeline( + f"{var_name} = rand_strided({V.graph.sizevars.size_hints(const_tensor.size())}, {V.graph.sizevars.size_hints(const_tensor.stride())}, device='{const_tensor.device}', dtype={const_tensor.dtype})" # type: ignore[arg-type] # noqa: B950 line too long + ) + elif isinstance(arg_sig, SizeArg): + symval_hint = V.graph.sizevars.size_hint(arg_sig.expr) + + # Force the seed_offset to be 0 so calls to the same kernel + # using different seed offset will have the same benchmark harness. + # We can dedup kernel definitions in this case. 
+ if "seed_offset" in arg_sig.name: + symval_hint = 0 + result.writeline(f"{var_name} = {symval_hint}") + else: + raise KeyError( + f"Don't find the buffer or const tensor for {arg_name}" + ) + var_names.append(var_name) + result.writeline(f"return {', '.join(var_names)},") + + result.writelines(["\n", "\n", "def call(args):"]) + if grid is None: + grid = [] + extra_args = [] + extra_args_str = None + for tree in self.active_range_trees(): + expr = pexpr(V.graph.sizevars.size_hint(tree.numel)) + extra_args.append(expr) + if tree.prefix != "r": + grid.append(expr) + if self.need_numel_args(): + extra_args_str = ", ".join(map(str, extra_args)) + ", " + else: + extra_args_str = "" + grid_arg = f"{extra_args_str}grid=grid({', '.join(grid)})" + else: + grid_arg = f"grid={grid}" + index = V.graph.scheduler.current_device.index + with result.indent(): + result.writeline(f"with {V.graph.device_ops.device_guard(index)}:") + with result.indent(): + result.writeline( + V.graph.device_ops.set_device(index) + ) # no-op to ensure context + stream_name = f"stream{index}" + result.writeline(f"{stream_name} = get_raw_stream({index})") + result.writeline( + f"{str(Placeholder.KERNEL_NAME)}.run(*args, {grid_arg}, stream={stream_name})" + ) + + # benchmark all configs + result.writelines(["\n", "\n", "def benchmark_all_configs(args):"]) + with result.indent(): + result.writeline(f"with {V.graph.device_ops.device_guard(index)}:") + with result.indent(): + result.writeline( + V.graph.device_ops.set_device(index) + ) # no-op to ensure context + result.writeline( + f"return {str(Placeholder.KERNEL_NAME)}.benchmark_all_configs(*args, {grid_arg})" + ) + + result.writelines(["\n", "\n", "if __name__ == '__main__':"]) + with result.indent(): + result.writeline("from triton.testing import do_bench") + result.writeline("") + + result.writeline("args = get_args()") + result.writeline( + "ms = do_bench(lambda: call(args), rep=40, fast_flush=True)" + ) + result.writeline(f"num_gb = {num_gb}") + 
result.writeline("gb_per_s = num_gb / (ms / 1e3)") + result.writeline( + 'print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")' + ) + + return result + + def imports_for_benchmark_kernel(self): + return textwrap.dedent( + """ + from torch._dynamo.testing import rand_strided + {} + import torch + from torch._inductor.triton_heuristics import grid, split_scan_grid + """.format( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + ) + + def estimate_kernel_num_bytes(self): + """ + Try the best to estimate the total size (in bytes) of the + kernel's inputs and outputs, which is used for estimating the memory + throughput of this kernel. This information is used for checking how + far we are from the peak memory bandwidth. It's important that + we want to avoid overestimating the sizes of the inputs and outputs, + because it can wrongfully give us a very large memory traffic value, + which may be even larger than the theoretical bandwidth and thus + become very misleading. This is particularly problematic for cases + where we slice some inputs. In those cases, we should only count + the size of the "slices" instead of the original inputs, because + only the slices contribute to the real memory traffic. + """ + nbytes = [] + ninplace_args = len(unique(self.args.inplace_buffers.values())) + _, call_args, _ = self.args.python_argdefs() + + # For pointwise and reduction kernels, this is the upper-bound numels + # for the output buffer. + # FIXME: This is not exactly right for cases like below: + # def foo(tensor0, tensor1): + # x0 = narrow(tensor0) + # return cat(x0, tensor1) + # For this example, we will end up overestimate the size for the + # slice s0. Potentially, we could have precise inputs information + # if we maintained the original inputs of the Pointwise kernel created + # for the "cat". However, I think it might be a bit overwhelming that + # we add such complexity only for handling some particular cases for + # benchmarking. 
+ out_numel = V.graph.sizevars.size_hint(sympy_product(self.numels)) + for i, arg in enumerate(call_args): + # "buf" may be narrowed. In this case, the number of memory accesses + # should be estimated based on the reinterpreted layout. + # On the other hand, buf may be broadcasted. In this case, + # counting the size of the underline storage would give us + # a better estimation in terms of memory accesses. + if arg not in self.buf_accesses: + nbytes.append(0) + continue + arg_numel = V.graph.get_numel(arg) + buf_size = V.graph.sizevars.size_hint(arg_numel) + if buf_size > out_numel: + # This arg points to a buf that has been sliced. + # We need to count each individual slice to have + # a better estimation. + indices: Set[Any] = set() + no_index_dep_count = 0 + for dep in self.buf_accesses[arg]: + if isinstance(dep, (StarDep, WeakDep)): + indices.add(f"no_index_dep_{no_index_dep_count}") + no_index_dep_count += 1 + else: + indices.add(dep.index) + numel = len(indices) * out_numel + else: + numel = buf_size + dtype = V.graph.get_dtype(arg) + dtype_size = get_dtype_size(dtype) + nbytes.append(numel * dtype_size * (1 + int(i < ninplace_args))) + return sum(nbytes) + + def _get_heuristic(self): + if self.persistent_reduction: + assert self.inside_reduction + return "persistent_reduction" + elif self.inside_reduction: + return "reduction" + return "pointwise" + + def codegen_kernel(self, name=None): + code = IndentedBuffer() + + size_hints = [] + for numel in self.numels: + numel_hint = V.graph.sizevars.symbolic_hint(numel) + if not isinstance(numel_hint, (int, sympy.Integer)): + # This default heuristic hint was picked carefully: it is + # large, to ensure that we don't shrink the block size (since + # if you don't have many elements, it'd be wasteful to pick a + # large block size). Since we don't know how many elements we + # might have, we should be OK with some inefficiency to make + # sure we handle the large case well. 
8192 is the largest + # block size we support, so we pick that. + # + # If we have a better hint for unbacked SymInts (e.g., because + # a user told us, or we are tracking upper bounds) we could + # use that here. + size_hint = 8192 + else: + size_hint = next_power_of_2(int(numel_hint)) + size_hints.append(size_hint) + + if not self.inside_reduction: + size_hints.pop() + + heuristics = self._get_heuristic() + + if name is None: + code.splice(gen_common_triton_imports()) + + if config.benchmark_kernel: + code.splice(self.imports_for_benchmark_kernel()) + + argdefs, _, signature = self.args.python_argdefs() + # maps actual expression to SizeArg if it is in sizevars replacements + for i, arg in enumerate(signature): + if isinstance(arg, SizeArg): + # mypy is unhappy about the sympy.Expr + # type for the key of the dict below + symbol = cast(sympy.Symbol, arg.expr) + if symbol in V.graph.sizevars.inv_precomputed_replacements: + signature[i] = SizeArg( + arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] + ) + + mutated_args = set() + for mutation in self.mutations: + if mutation in self.args.input_buffers: + mutated_args.add(self.args.input_buffers[mutation]) + if ( + mutation in self.args.inplace_buffers + and mutation not in V.graph.removed_buffers + and mutation not in self.removed_buffers + ): + mutated_args.add(self.args.inplace_buffers[mutation].inner_name) + if mutation in self.args.output_buffers: + mutated_args.add(self.args.output_buffers[mutation]) + mutated_args = sorted(mutated_args) + + triton_meta_signature = signature_to_meta( + signature, size_dtype=self.index_dtype + ) + triton_meta = { + "signature": triton_meta_signature, + "device": V.graph.scheduler.current_device.index, + "device_type": V.graph.scheduler.current_device.type, + "constants": {}, + } + + inductor_meta = { + "autotune_hints": set(self.autotune_hints), + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "mutated_arg_names": mutated_args, + "no_x_dim": self.no_x_dim, + 
"backend_hash": torch.utils._triton.triton_hash_with_backend(), + } + num_gb = None + if config.benchmark_kernel or config.profile_bandwidth: + num_gb = self.estimate_kernel_num_bytes() / 1e9 + inductor_meta["kernel_num_gb"] = num_gb + + for tree in self.active_range_trees(): + sizearg = SizeArg(f"{tree.prefix}numel", tree.numel) + signature.append(sizearg) + triton_meta_signature[len(argdefs)] = signature_of( + sizearg, size_dtype=self.index_dtype + ) + argdefs.append(f"{tree.prefix}numel") + # constexpr version causes issues, see + # https://github.com/pytorch/torchdynamo/pull/1362 + # triton_meta["constants"][len(argdefs)] = V.graph.sizevars.size_hint( + # tree.numel + # ) + # argdefs.append(f"{tree.prefix}numel: tl.constexpr") + triton_meta["configs"] = [config_of(signature)] + + # Triton compiler includes equal_to_1 args into constants even + # when they are not constexpr. otherwise there may be a segfault + # during launching the Inductor-compiled Triton kernel. + # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307 + # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 + for arg_num in triton_meta["configs"][0].equal_to_1: # type: ignore[index] + triton_meta["constants"][arg_num] = 1 # type: ignore[index] + + self.triton_meta = triton_meta + + for tree in self.range_trees: + if tree.prefix == "r" and self.persistent_reduction: + # RBLOCK for persistent_reduction is defined in codegen_static_numels + continue + if tree.tensor_dim is None: + continue + argdefs.append(f"{tree.prefix.upper()}BLOCK : tl.constexpr") + + self.codegen_body() + + for helper in self.helper_functions: + code.writeline("") + code.splice(helper) + + if self.inside_reduction: + reduction_hint = self.reduction_hint + heuristics_line = f""" + @triton_heuristics.{heuristics}( + size_hints={size_hints!r}, + reduction_hint={reduction_hint}, + filename=__file__, + triton_meta={triton_meta!r}, + 
def codegen_static_numels(self, code):
    """
    Hard-code numel values that are statically known.

    For a kernel like
        def KERNEL_NAME(in_ptr0, ..., xnumel, rnumel, XBLOCK: tl.constexpr, ...):
    we emit e.g. ``xnumel = 4096`` immediately after the signature,
    shadowing the launch-time argument with a literal. Triton then sees a
    constant, which is a better signal for unrolling and static indexing.

    Also pins ``RBLOCK`` (next power of two of the reduction numel) for
    persistent reductions, and ``XBLOCK = 1`` when the kernel has no x dim.
    """
    for tree in self.range_trees:
        numel_expr = V.graph.sizevars.simplify(tree.numel)
        is_static = isinstance(numel_expr, (sympy.Integer, int))

        # r-numel is only an argument while we are inside the reduction.
        if self.inside_reduction or tree.prefix != "r":
            if is_static:
                code.writeline(f"{tree.prefix}numel = {int(numel_expr)}")

        if tree.prefix == "r" and self.persistent_reduction:
            if not is_static:
                continue
            rblock = next_power_of_2(int(numel_expr))
            code.writeline(f"RBLOCK: tl.constexpr = {rblock}")

        if tree.prefix == "x" and self.no_x_dim:
            code.writeline("XBLOCK: tl.constexpr = 1")
range(len(call_args)): + if V.graph.is_unspec_arg(call_args[i]): + call_args[i] = call_args[i] + ".item()" + + return call_args + + def call_kernel(self, name: str, node: Optional[IRNode] = None): + wrapper = V.graph.wrapper_code + call_args = self.get_call_args() + grid: List[Any] = [] + self.add_numel_to_call_args_and_grid(name, call_args, grid) + current_device = V.graph.scheduler.current_device + + if self.args.workspace_arg is not None: + ws = self.args.workspace_arg + wrapper.generate_workspace_allocation( + ws.nbytes, current_device, ws.zero_fill + ) + + grid = wrapper.generate_default_grid(name, grid) + wrapper.generate_kernel_call( + name, + call_args, + grid, + current_device.index, + cuda=True, + triton=True, + grid_fn=self._get_grid_fn(), + triton_meta=self.triton_meta, + ) + + if self.args.workspace_arg is not None: + wrapper.writeline(wrapper.make_free_by_names(["workspace"])) + + def codegen_nan_check(self): + wrapper = V.graph.wrapper_code + _, call_args, arg_types = self.args.python_argdefs() + for arg, arg_type in zip(call_args, arg_types): + if isinstance(arg_type, TensorArg): + line = f"assert not {arg}.isnan().any().item()" + wrapper.writeline(line) + line = f"assert not {arg}.isinf().any().item()" + wrapper.writeline(line) + + def warn_mix_layout(self, kernel_name): + """ + Print message if the kernel have mixed layout inputs. + Only care about 4D tensor for now. + """ + if ( + len(self.args.input_buffers) == 1 + and len(self.args.output_buffers) == 1 + and len(self.args.inplace_buffers) == 0 + ): + # even if input buffer and output buffer have different layout, + # this can be a layout conversion kernel. No need to warn for + # the mix layouts. 
+ return + + argdefs, call_args, signature = self.args.python_argdefs() + uniform_stride_order = None + for arg_name in call_args: + buf = V.graph.get_buffer(arg_name) + if buf and len(buf.layout.size) == 4: + # ignore the tensor if only 1 dimension is non-zero + if len([x for x in buf.layout.size if x == 1]) == 3: + continue + stride_order = ir.get_stride_order(buf.layout.stride) + if uniform_stride_order is None: + uniform_stride_order = stride_order + elif uniform_stride_order != stride_order: + msg = yellow_text( + f"Expected stride order {uniform_stride_order}, but found stride order" + + f" {stride_order} for kernel {kernel_name}" + ) + log.warning(msg) + + stride_order_list = [ + ir.get_stride_order(V.graph.get_buffer(name).layout.stride) + if V.graph.get_buffer(name) + else None + for name in call_args + ] + size_list = [ + V.graph.get_buffer(name).layout.size + if V.graph.get_buffer(name) + else None + for name in call_args + ] + source_list = [ + "GraphInput" + if name in V.graph.graph_inputs + else "IntermediateBuffer" + if name in V.graph.name_to_buffer + else None + for name in call_args + ] + + msg = yellow_text( + f" param names {argdefs}\n buf names {call_args}\n strides {stride_order_list}" + + f"\n sizes {size_list}\n sources {source_list}\n" + ) + log.warning(msg) + return + msg = green_text( + f"All the inputs for the triton kernel {kernel_name} have uniform layout" + ) + log.warning(msg) + + def create_cse_var(self, *args, **kwargs): + return TritonCSEVariable(*args, **kwargs) + + +class TritonScheduling(BaseScheduling): + def __init__(self, scheduler): + self.scheduler = scheduler + + def group_fn(self, sizes): + return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes) + + def can_fuse(self, node1, node2): + """ + Hook called by Scheduler to determine if the Triton backend + can fuse node1 and node2. These nodes might already be + FusedSchedulerNodes. 
+ """ + if isinstance(node1, scheduler.ForeachKernelSchedulerNode) or isinstance( + node2, scheduler.ForeachKernelSchedulerNode + ): + return scheduler.ForeachKernelSchedulerNode.can_fuse(node1, node2) + + _, (numel1, rnumel1) = node1.group + _, (numel2, rnumel2) = node2.group + why = WhyNoFuse(node1, node2) + + if node1.is_split_scan() and not node2.is_split_scan(): + if node2.is_reduction(): + why("Split scan cannot fuse with reductions") + elif node2.is_split_scan() and not node1.is_split_scan(): + if node1.is_reduction(): + why("Split scan cannot fuse with reductions") + + if node1.is_reduction() and node2.is_reduction(): + reduction_can_fuse = numel1 == numel2 and rnumel1 == rnumel2 + if not reduction_can_fuse: + why( + "numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return reduction_can_fuse + + if not node1.is_reduction() and not node2.is_reduction(): + if not (numel1 == numel2 and rnumel1 == rnumel2): + why( + "numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return False + + if node1.is_template(): + # Only allow fusion for TritonTemplates for now. + # Fusion for CUDATemplates are not supported. 
def generate_node_schedule(self, nodes, numel, rnumel):
    """
    Linearize ``nodes`` into a schedule for a single (numel, rnumel) kernel.

    The returned list contains scheduler nodes interleaved with the
    DisableReduction / EnableReduction markers that delimit code emitted
    outside of the reduction loop.

    Args:
        nodes: pre-fused scheduler nodes, in dependency order.
        numel: pointwise iteration size of the kernel.
        rnumel: reduction iteration size (1 for pure pointwise kernels).

    Returns:
        List of nodes and reduction markers.
    """
    node_schedule: List[Any] = []

    # Writes with a reduced shape, meaning they are only present once the
    # reduction loop has ended
    current_loop_reduced_writes = set()
    current_loop_has_writes = False
    done = set()

    def fits_in_main_body(n):
        # Node matches the kernel's (numel, rnumel) shape, or is a pointwise
        # node over the flattened numel * rnumel space.
        _, (node_numel, node_rnumel) = n.group
        return (node_numel == numel and node_rnumel == rnumel) or (
            node_numel == numel * rnumel and node_rnumel == 1
        )

    def fits_outside_reduction(n):
        # Pointwise over numel only: must run after the reduction loop ends.
        _, (node_numel, node_rnumel) = n.group
        return node_numel == numel and node_rnumel == 1 and rnumel != 1

    def schedule_node_in_loop(n):
        nonlocal current_loop_has_writes
        done.add(n)
        node_schedule.append(n)
        current_loop_has_writes = True
        # A scan is modelled as a reduction in the scheduler but has a
        # full sized output that can be used inside the loop body
        if (
            n.is_reduction()
            and isinstance(n, scheduler.SchedulerNode)
            and isinstance(n.node, ir.ComputedBuffer)
            and not isinstance(n.node.data, ir.Scan)
        ):
            current_loop_reduced_writes.add(n.get_name())

    @contextlib.contextmanager
    def end_current_reduction_loop():
        nonlocal current_loop_has_writes
        if current_loop_has_writes:
            # flush out any other runnable nodes to reduce number of loops
            for other_node in nodes[index + 1 :]:
                # Bug fix: the original tested/scheduled the *outer* ``node``
                # (already in ``done``), so this flush never scheduled
                # anything; it must operate on ``other_node``.
                if (
                    other_node not in done
                    and fits_in_main_body(other_node)
                    and not (current_loop_reduced_writes & other_node.ancestors)
                ):
                    schedule_node_in_loop(other_node)

        if node_schedule and node_schedule[-1] is EnableReduction:
            node_schedule.pop()
        else:
            node_schedule.append(DisableReduction)
        yield
        node_schedule.append(EnableReduction)
        current_loop_reduced_writes.clear()
        current_loop_has_writes = False

    for index, node in enumerate(nodes):
        if node in done:
            continue
        done.add(node)

        def requires_closing_previous_reduction(node, node_schedule):
            # A node that reads a reduced write cannot live inside the same
            # reduction loop that produces it.
            if rnumel == 1:
                return False
            if not current_loop_reduced_writes & node.ancestors:
                return False
            assert node_schedule and not isinstance(
                node_schedule[-1], (EnableReduction, DisableReduction)
            )
            return bool(current_loop_reduced_writes)

        if fits_in_main_body(node):
            if requires_closing_previous_reduction(node, node_schedule):
                with end_current_reduction_loop():
                    pass  # need to start a new reduction loop

            schedule_node_in_loop(node)
        elif fits_outside_reduction(node):
            with end_current_reduction_loop():
                node_schedule.append(node)
        else:
            raise NotImplementedError(
                f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}"
            )

    return node_schedule
accounted for + buf_sizes = [ + buf.get_layout().storage_size() + for buf in buffers + if not isinstance(buf.get_layout(), ir.MultiOutputLayout) + ] + + if not all(within_32bit(size) for size in buf_sizes): + return False + + # Only install guards for 32-bit indexing as there is no correctness + # issue with using 64-bit for everything + V.graph.sizevars.guard_leq(numel, int_max) # type: ignore[arg-type] + for size in buf_sizes: + V.graph.sizevars.guard_leq(size, int_max) # type: ignore[arg-type] + return True + + @staticmethod + def select_index_dtype(node_schedule, numel, reduction_numel): + # Gather all used buffer names + buffer_names = set() + for node in node_schedule: + if not isinstance(node, scheduler.BaseSchedulerNode): + continue + + buffer_names.update(node.get_names()) + buffer_names.update(node.used_buffer_names()) + + # Get buffers objects + def _get_buffer(name: str) -> Union[ir.Buffer, ir.TensorBox]: + if name in V.graph.name_to_buffer: + return V.graph.name_to_buffer[name] + elif name in V.graph.graph_inputs: + return V.graph.graph_inputs[name] + elif name in V.graph.constants: + data = V.graph.constants[name] + return ir.ConstantBuffer( + name, + ir.FixedLayout( + data.device, data.dtype, *V.graph.static_sizes_strides(data) + ), + ) + raise RuntimeError(f"Failed to find buffer matching name {name}") + + buffers = [_get_buffer(name) for name in buffer_names] + + # In theory we can separately check xnumel and rnumel are <= int_max + # but some indexers do use the full linear index so we need to be + # conservative here. 
+ total_numel = numel * reduction_numel + + if TritonScheduling.can_use_32bit_indexing(total_numel, buffers): + return "tl.int32" + return "tl.int64" + + def get_kernel_args(self, node_schedule, numel, reduction_numel): + reductions = list( + filter( + lambda n: n not in (EnableReduction, DisableReduction) + and n.is_reduction(), + node_schedule, + ) + ) + if len(reductions) > 0: + hints = [self.reduction_hint(n) for n in reductions] + if hints.count(hints[0]) == len(hints): + reduction_hint_val = hints[0] + else: + reduction_hint_val = ReductionHint.DEFAULT + else: + reduction_hint_val = ReductionHint.DEFAULT + + mutations = set() + for node in node_schedule: + if hasattr(node, "get_mutations"): + mutations.update(node.get_mutations()) + + index_dtype = self.select_index_dtype(node_schedule, numel, reduction_numel) + + return reduction_hint_val, mutations, index_dtype + + def codegen_comment(self, node_schedule): + wrapper = V.graph.wrapper_code + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + if origins: + wrapper.writeline(origins) + + if config.debug_fusion: + from torch._inductor.scheduler import ( + BaseSchedulerNode, + ForeachKernelSchedulerNode, + ) + + if not any( + isinstance(n, ForeachKernelSchedulerNode) for n in node_schedule + ): + # We probably should look what are the nodes inside a foreach + # schedule node + node_names = [ + n.get_name() + for n in node_schedule + if isinstance(n, BaseSchedulerNode) + ] + wrapper.writeline( + f"{wrapper.comment} Fused node name list: {', '.join(node_names)}" + ) + + def codegen_node_schedule( + self, node_schedule, buf_accesses, numel, reduction_numel + ): + from torch._inductor.codegen.triton_split_scan import TritonSplitScanKernel + + tiled_groups = self.select_tiling(node_schedule, numel, reduction_numel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, reduction_numel + ) + + is_split_scan = any( + isinstance(node, BaseSchedulerNode) and 
node.is_split_scan() + for node in node_schedule + ) + kernel_type = TritonSplitScanKernel if is_split_scan else TritonKernel + kernel_args = tiled_groups + kernel_kwargs = { + "reduction_hint": reduction_hint_val, + "mutations": mutations, + "index_dtype": index_dtype, + } + kernel = kernel_type( + *kernel_args, + **kernel_kwargs, + ) + kernel.buf_accesses = buf_accesses + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + + with V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + + kernel_name = self.define_kernel(src_code, node_schedule) + log.debug("Generating kernel code with kernel_name: %s", kernel_name) + kernel.kernel_name = kernel_name + kernel.code_hash = code_hash(src_code) + + if kernel.persistent_reduction and config.triton.multi_kernel: + kernel2 = TritonKernel( + *kernel_args, + **kernel_kwargs, + disable_persistent_reduction=True, + ) + self.codegen_node_schedule_with_kernel(node_schedule, kernel2) + with V.set_kernel_handler(kernel2): + src_code2 = kernel2.codegen_kernel() + kernel_name2 = self.define_kernel(src_code2, node_schedule) + kernel2.kernel_name = kernel_name2 + kernel2.code_hash = code_hash(src_code2) + + final_kernel = MultiKernel([kernel, kernel2]) + else: + final_kernel = kernel # type: ignore[assignment] + + with V.set_kernel_handler(final_kernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + + self.codegen_comment(node_schedule) + final_kernel.call_kernel(final_kernel.kernel_name) + if config.nan_asserts: + final_kernel.codegen_nan_check() + if config.warn_mix_layout: + final_kernel.warn_mix_layout(kernel_name) + + V.graph.removed_buffers |= final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove + + if ( + V.graph.wrapper_code.supports_intermediate_hooks + and config.generate_intermediate_hooks + ): + # Not every node in the schedule will actually be live on output; + # we can't check dead buffers. 
def codegen_node_schedule_with_kernel(self, node_schedule, kernel):
    """Emit code for every node of ``node_schedule`` into ``kernel``."""

    def nodes_until_reduction_disabled(nodes):
        # Prefix of ``nodes`` up to (not including) the next DisableReduction.
        return itertools.takewhile(lambda n: n is not DisableReduction, nodes)

    with kernel:
        ctx_stack = contextlib.ExitStack()
        kernel.set_last_usage(nodes_until_reduction_disabled(node_schedule))

        # First pass: decide in-place updates before any code is emitted.
        for entry in node_schedule:
            if entry not in (EnableReduction, DisableReduction):
                entry.decide_inplace_update()

        # Second pass: emit code, toggling reduction mode at the markers.
        for pos, entry in enumerate(node_schedule):
            if entry is DisableReduction:
                ctx_stack.enter_context(kernel.disable_reduction())
            elif entry is EnableReduction:
                ctx_stack.close()
                kernel.set_last_usage(
                    nodes_until_reduction_disabled(node_schedule[pos:])
                )
            else:
                # TODO - use split ranges ?
                indexing_dtype_strength_reduction(entry._body)
                entry.codegen(kernel.split_and_set_ranges(entry.get_ranges()))
+ src_code = src_code.replace("#pragma CMT", "#") + + basename, _, kernel_path = get_path(code_hash(src_code.strip()), "py") + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline(f"async_compile.triton({subs_name!r}, '''") + compile_wrapper.splice(src_code, strip=True) + compile_wrapper.writeline( + f"''', device_str='{V.graph.scheduler.current_device.type}')" + ) + + metadata_comment = f"# kernel path: {kernel_path}" + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + metadata_comment += "\n" + origins + "\n" + detailed_origins + wrapper.define_kernel( + kernel_name, compile_wrapper.getvalue(), metadata_comment + ) + + # log kernel metadata for offline analysis. + # E.g. one can find all unaligned inner reduction and check if + # padding helps with the perf kernel by kernel. + if is_metric_table_enabled("kernel_metadata"): + log_kernel_metadata(kernel_name, kernel_path, src_code) + + return kernel_name + + def codegen_template( + self, template_node, epilogue_nodes, only_gen_src_code=False + ) -> Optional[str]: + """ + Codegen a triton template + + If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper + """ + _, (numel, rnumel) = template_node.group + assert rnumel == 1 + kernel, render = template_node.node.make_kernel_render(template_node.node) + with kernel: + if not only_gen_src_code: + for node in [template_node, *epilogue_nodes]: + node.mark_run() + partial_code = render() + for node in epilogue_nodes: + node.codegen(kernel.split_and_set_ranges(node.get_ranges())) + + # finalize must be called after adding epilogue above + with V.set_kernel_handler(kernel): + # TODO: Maybe unify CUDATemplateKernel to also use PartialRender for flexible epilogue fusion. 
+ src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + node_schedule = [template_node, *epilogue_nodes] + + if config.benchmark_kernel: + num_gb = kernel.estimate_kernel_num_bytes() / 1e9 + grid_args = V.graph.sizevars.size_hints(kernel.call_sizes) + assert kernel.meta is not None, "meta is None" + grid = kernel.grid_fn(*grid_args, kernel.meta) + src_code = ( + f"{kernel.imports_for_benchmark_kernel()}\n" + f"{src_code}\n" + f"{kernel.codegen_kernel_benchmark(num_gb, grid).getvalue()}" + ) + + if only_gen_src_code: + return src_code + + kernel_name = self.define_kernel(src_code, node_schedule) + + self.codegen_comment(node_schedule) + kernel.call_kernel(kernel_name, template_node.node) + V.graph.removed_buffers |= kernel.removed_buffers + V.graph.inplaced_to_remove |= kernel.inplaced_to_remove + self.scheduler.free_buffers() + return None + + def codegen_sync(self): + V.graph.wrapper_code.writeline(V.graph.device_ops.synchronize()) + + def codegen_foreach(self, foreach_node): + from .triton_foreach import ForeachKernel + + for partitions_with_metadata in ForeachKernel.horizontal_partition( + foreach_node.get_subkernel_nodes(), self + ): + kernel = ForeachKernel() + for nodes, tiled_groups, numel, rnumel in partitions_with_metadata: + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + ( + reduction_hint_val, + mutations, + index_dtype, + ) = self.get_kernel_args(node_schedule, numel, rnumel) + + subkernel = kernel.create_sub_kernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel( + node_schedule, + subkernel, + ) + + with V.set_kernel_handler(subkernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + V.graph.removed_buffers |= subkernel.removed_buffers + V.graph.inplaced_to_remove |= subkernel.inplaced_to_remove + + src_code = 
kernel.codegen_kernel() + kernel_name = self.define_kernel(src_code, [foreach_node]) + self.codegen_comment([foreach_node]) + kernel.call_kernel(V.graph.wrapper_code, kernel_name) + + self.scheduler.free_buffers() + + @staticmethod + @functools.lru_cache(32) + def candidate_tilings(node): + ranges, reduction_ranges = node.get_ranges() + if len(ranges) <= 1: + return () + + rw = node.pointwise_read_writes() + assert len(rw.range_vars) == len(ranges) + + # isinstance(dep, MemoryDep): this filters out StarDeps. StarDeps refer to reads + # that need to access the entire tensor; they don't contribute read indexing + # information (and practically, they don't have dep.index so they can't be used + # for stride_hints below + dep_sources = [rw.reads, rw.writes] + assert all( + isinstance(dep, (MemoryDep, StarDep)) + for dep in itertools.chain.from_iterable(dep_sources) + ) + deps = [ + dep + for dep in itertools.chain.from_iterable(dep_sources) + if dep.name not in V.graph.removed_buffers and isinstance(dep, MemoryDep) + ] + write_names = {dep.name for dep in rw.writes} + + tilings: List[CandidateTiling] = [] + + for dep in deps: + strides = V.graph.sizevars.stride_hints(dep.index, rw.range_vars) + assert len(strides) == len(ranges) + try: + split = strides.index(1) + 1 + if split == len(ranges): + continue + if all(s == 0 for s in strides[split:]): + # if this is a broadcasted tensor and all dimensions after split are broadcast, + # this is not a real split + continue + + except ValueError: + continue + tiled_groups = ( + V.graph.sizevars.simplify(sympy_product(ranges[:split])), + V.graph.sizevars.simplify(sympy_product(ranges[split:])), + ) + # score by number of elements + score = V.graph.sizevars.size_hint( + sympy_product( + size for size, stride in zip(ranges, strides) if stride != 0 + ) + ) + if dep.name in write_names: + # ngimel said contiguous writes is more important than reads + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[0]): + score *= 2 + if 
@classmethod
def select_tiling(cls, node_schedule, numel, reduction_numel=sympy.Integer(1)):
    """
    Heuristics to decide how to tile kernels.
    Currently, we tile based on stride-1 dimensions.

    Returns:
        `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

    """
    if reduction_numel != 1 or config.triton.max_tiles <= 1:
        # TODO(jansel): should we tile reductions?
        # do perf hint here if stride-1 dim is not being reduced
        if perf_hint_log.level <= logging.WARNING:
            for node in EnableReduction.filter(node_schedule):
                if len(cls.candidate_tilings(node)) > 0:
                    perf_hint_log.info("reduction over non-contiguous dims")
                    break
        return (numel, reduction_numel)

    # Accumulate scores per distinct tiling, counting each buffer once.
    seen_buffers = set()
    tile_scores: Counter[Any] = collections.Counter()
    for node in EnableReduction.filter(node_schedule):
        for candidate in cls.candidate_tilings(node):
            if candidate.name in seen_buffers:
                continue
            seen_buffers.add(candidate.name)
            tile_scores[candidate.tiling] += candidate.score

    ranked = [tiling for tiling, _score in tile_scores.most_common()]

    if config.triton.max_tiles >= 3:
        # Consider adding a third dimension of tiling, but only
        # when a1 is a multiple of b1; otherwise, you have a lot
        # of stragglers which is annoying to generate code for.
        #
        # NB: More than three max tiles is not enabled by default.

        # Add one 3D tiling choice
        for i in range(1, len(ranked)):
            a0, a1 = ranked[0]
            b0, b1 = ranked[i]
            if V.graph.sizevars.size_hint(a1 - b1) == 0:
                continue
            if V.graph.sizevars.size_hint(a1 - b1) < 0:
                # swap so a0 is bigger
                a0, a1 = ranked[i]
                b0, b1 = ranked[0]
            assert V.graph.sizevars.size_hint(a1 - b1) > 0
            if V.graph.sizevars.statically_known_multiple_of(a1, b1):
                ranked = [(a0, FloorDiv(a1, b1), b1)] + ranked
                break  # only 1 choice for now

    if len(ranked) > 1:
        perf_hint_log.info("possibly bad tiling: %s", ranked)

    for tiled_groups in ranked:
        new_groups = (*tiled_groups, reduction_numel)
        if all(
            TritonKernel.is_compatible(new_groups, node.get_ranges())
            for node in node_schedule
            if isinstance(node, scheduler.SchedulerNode)
        ):
            return new_groups

    return (numel, reduction_numel)
+ for n in nodes: + n.last_usage = set() + + if not nodes[0].is_template(): + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + + tiled_groups = self.select_tiling(node_schedule, numel, rnumel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, rnumel + ) + + kernel = TritonKernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + with config.patch("benchmark_kernel", True), V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + else: + template_node = nodes[0] + epilogue_nodes = nodes[1:] + + with config.patch("benchmark_kernel", True): + src_code = self.codegen_template( + template_node, epilogue_nodes, only_gen_src_code=True + ) + + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), "triton_") + mod = PyCodeCache.load(src_code) + + def cache_file_path(): + assert mod.__file__ is not None + return os.path.splitext(mod.__file__)[0] + ".kernel_perf" + + def load_cache(): + path = cache_file_path() + if os.path.exists(path): + with open(path) as fd: + return float(fd.read()) + return None + + def store_cache(): + path = cache_file_path() + with open(path, "w") as fd: + fd.write(str(ms)) + + log.debug( + "kernel src code for %s written to: %s", + {n.get_name() for n in nodes}, + mod.__file__, + ) + ms = load_cache() + if ms is not None: + return ms, mod.__file__ + + args = mod.get_args() + call = mod.call + wrapped_jit_function = mod.triton_ + + # call once to trigger the compilation + call(wrapped_jit_function.clone_args(*args)[0]) + + launchers = wrapped_jit_function.launchers + assert len(launchers) == 1 + if launchers[0].n_spills > 0: + # skip benchmarking the kernel if there are register spills + ms = float("inf") + else: + # We have to clone the inplace updated arguments to avoid 
earlier calls + # generating out of range indices for later calls. + ms = do_bench(lambda: call(wrapped_jit_function.clone_args(*args)[0])) + + log.debug( + "The fused kernel for %s took %.3f ms to run", + {n.get_name() for n in nodes}, + ms, + ) + store_cache() + return ms, mod.__file__ + + +@dataclasses.dataclass +class CandidateTiling: + tiling: Tuple[sympy.Expr, sympy.Expr] + score: int # higher is better + name: Optional[str] = None + + @staticmethod + def is_good_size(s): + """Somewhat arbitrary heuristic used to boost scores for some sizes""" + s = V.graph.sizevars.size_hint(s) + return s >= 32 and (s % 32 == 0) + + +class DisableReduction: + """ + Marker to invoke `kernel.disable_reduction()`. This closes a + reduction loop and allows for pointwise ops to occur on the output + of a reduction. + """ + + +class EnableReduction: + """ + Marker to end a DisableReduction block. + """ + + @staticmethod + def filter(node_schedule): + """ + Get the nodes from node_schedule skipping those in a + DisableReduction block. 
+ """ + disabled = False + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + # Don't tile stuff outside the main reduction loop + disabled = node is DisableReduction + elif disabled: + pass + else: + yield node + + +class CantSplit(Exception): + pass diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e153527cbf4a19a5b12dca1bdd60aa44c663df5a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17ae46cd458107f8aba65c20073bcd1e304c8ea5 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a6ae6f2c585d3cf76b0cdaefe65859cb7abdf04 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc differ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d60d520dae0946fbb2f57c45f87fd7e3257a7197 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8defda6cab2941861c8f65692de158d395b20787 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b11aa9b0c8f12d918a28a63c7c9ab2a8a7c7172 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4a472904d764b7befe3e9e1c14e8189988bd056 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9ba536fc968a397f2b2158201e629a57c488322 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d04bc3a87a8f98568c0cf2e3d06c0959d5de0c7f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95da17d3331c721746fed9ab37f8e4041cfb41f8 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81c2a1d20391c204decfdc6ac591f70e2f57f5d4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c677c770db9f5006a36013799dd7370a8c49b034 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19d77fddf7a509aecd317166ea57ff04845cd1ef Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39345b119bff177b95a46b1fd3fe26b3331cebfa Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py new file mode 100644 index 0000000000000000000000000000000000000000..b4271e0c5ee50c533d86b50be243a8efa41c78a4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py @@ -0,0 +1,218 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = 
CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = 
CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_14_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = 
CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_14_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, 
memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = 
CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = 
CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_14_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, 
sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_14_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py new file mode 100644 index 0000000000000000000000000000000000000000..e921a07d46774fcd28e416b1043f12138727abd7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py @@ -0,0 +1,233 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) 
+sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, bmm_default_2, Ignored()) +view_default_7 = CallFunction(aten.view.default, convert_element_type_default_1, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) 
+clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_7_training = 
MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, 
permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_7_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) 
+sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = 
CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_7_half_training = 
MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, 
Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_7_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b17d76e12794b8407063c84d5dbb55b3aac25c99 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py @@ -0,0 +1 @@ +from . 
import mm, mm_common, mm_plus_mm, unpack_mixed_mm diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c5b3ededacc7beb426c70d4a4f3c4e15eb7cfd9 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..209463713e49f2d11d9d5fc4c7705c0bb3353900 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..d9fcd063584640bd04d824231099b85db72f15e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py @@ -0,0 +1,82 @@ +import logging +from typing import List + +from ..select_algorithm import autotune_select_algorithm, ChoiceCaller, TritonTemplate +from .mm_common import mm_args, mm_configs, mm_grid, mm_options + +log = logging.getLogger(__name__) + +uint4x2_mixed_mm_template = TritonTemplate( + name="uint4x2_mixed_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 
1)}} + K = {{size("A", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None]//2 * stride_bk + rbn[None, :] * stride_bn) + b_shifts = 4*(rk%2) + b_subs = 8*(1-(rk%2)) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
+ b = ((b >> b_shifts[:, None]) & 0xF) - 8 + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K//2 * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +def tuned_uint4x2_mixed_mm(mat1, mat2, mat2_mm_shape, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None, use_4x2_dim=True) + choices: List[ChoiceCaller] = [] + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + for config in mm_configs(m, n, k): + uint4x2_mixed_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + return autotune_select_algorithm("uint4x2_mixed_mm", choices, [mat1, mat2], layout) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b042126d99e217b789853d4a52b871918f9162f6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py @@ -0,0 +1,1412 @@ +r""" +This package adds support for CUDA tensor types. + +It implements the same function as CPU tensors, but they utilize +GPUs for computation. + +It is lazily initialized, so you can always import it, and use +:func:`is_available()` to determine if your system supports CUDA. + +:ref:`cuda-semantics` has more details about working with CUDA. 
+""" + + +import contextlib +import importlib +import os +import sys +import threading +import traceback +import warnings +from functools import lru_cache +from typing import Any, Callable, cast, List, Optional, Tuple, Union + +import torch +import torch._C +from torch.types import Device +from .. import device as _device +from .._utils import _dummy_type, _LazySeedTracker, classproperty +from ._utils import _get_device_index +from .graphs import ( + CUDAGraph, + graph, + graph_pool_handle, + is_current_stream_capturing, + make_graphed_callables, +) +from .streams import Event, ExternalStream, Stream + +try: + from torch._C import _cudart # type: ignore[attr-defined] +except ImportError: + _cudart = None + +_initialized = False +_tls = threading.local() +_initialization_lock = threading.Lock() +_queued_calls: List[ + Tuple[Callable[[], None], List[str]] +] = [] # don't invoke these until initialization occurs +_is_in_bad_fork = getattr(torch._C, "_cuda_isInBadFork", lambda: False) +_device_t = Union[_device, str, int, None] + +_HAS_PYNVML = False +_PYNVML_ERR = None +try: + import pynvml # type: ignore[import] + + _HAS_PYNVML = True +except ImportError as err: + _PYNVML_ERR = err # sometimes a lib is installed but the import fails for some other reason, so we log the error for later + +_lazy_seed_tracker = _LazySeedTracker() + +# Define dummy _CudaDeviceProperties type if PyTorch was compiled without CUDA +if hasattr(torch._C, "_CudaDeviceProperties"): + _CudaDeviceProperties = torch._C._CudaDeviceProperties +else: + _CudaDeviceProperties = _dummy_type("_CudaDeviceProperties") # type: ignore[assignment, misc] + +if hasattr(torch._C, "_cuda_exchangeDevice"): + _exchange_device = torch._C._cuda_exchangeDevice +else: + + def _exchange_device(device: int) -> int: + if device < 0: + return -1 + raise RuntimeError("PyTorch was compiled without CUDA support") + + +if hasattr(torch._C, "_cuda_maybeExchangeDevice"): + _maybe_exchange_device = 
torch._C._cuda_maybeExchangeDevice +else: + + def _maybe_exchange_device(device: int) -> int: + if device < 0: + return -1 + raise RuntimeError("PyTorch was compiled without CUDA support") + + +has_half: bool = True +has_magma: bool = torch._C._has_magma + +default_generators: Tuple[torch._C.Generator] = () # type: ignore[assignment] + + +def _is_compiled() -> bool: + r"""Return true if compile with CUDA support.""" + return hasattr(torch._C, "_cuda_getDeviceCount") + + +def _nvml_based_avail() -> bool: + return os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" + + +def is_available() -> bool: + r"""Return a bool indicating if CUDA is currently available.""" + if not _is_compiled(): + return False + if _nvml_based_avail(): + # The user has set an env variable to request this availability check that attempts to avoid fork poisoning by + # using NVML at the cost of a weaker CUDA availability assessment. Note that if NVML discovery/initialization + # fails, this assessment falls back to the default CUDA Runtime API assessment (`cudaGetDeviceCount`) + return device_count() > 0 + else: + # The default availability inspection never throws and returns 0 if the driver is missing or can't + # be initialized. This uses the CUDA Runtime API `cudaGetDeviceCount` which in turn initializes the CUDA Driver + # API via `cuInit` + return torch._C._cuda_getDeviceCount() > 0 + + +def is_bf16_supported(): + r"""Return a bool indicating if the current CUDA/ROCm device supports dtype bfloat16.""" + # Check for ROCm, if true return true, no ROCM_VERSION check required, + # since it is supported on AMD GPU archs. + if torch.version.hip: + return True + + device = torch.cuda.current_device() + + # Check for CUDA version and device compute capability. + # This is a fast way to check for it. 
+ cuda_version = torch.version.cuda + if ( + cuda_version is not None + and int(cuda_version.split(".")[0]) >= 11 + and torch.cuda.get_device_properties(device).major >= 8 + ): + return True + + # Finally try to create a bfloat16 device. + return _check_bf16_tensor_supported(device) + + +@lru_cache(maxsize=16) +def _check_bf16_tensor_supported(device: _device_t): + try: + torch.tensor([1.0], dtype=torch.bfloat16, device=device) + return True + except Exception: + return False + + +def _sleep(cycles): + torch._C._cuda_sleep(cycles) + + +def _check_capability(): + incorrect_binary_warn = """ + Found GPU%d %s which requires CUDA_VERSION >= %d to + work properly, but your PyTorch was compiled + with CUDA_VERSION %d. Please install the correct PyTorch binary + using instructions from https://pytorch.org + """ + + old_gpu_warn = """ + Found GPU%d %s which is of cuda capability %d.%d. + PyTorch no longer supports this GPU because it is too old. + The minimum cuda capability supported by this library is %d.%d. + """ + + if torch.version.cuda is not None: # on ROCm we don't want this check + CUDA_VERSION = torch._C._cuda_getCompiledVersion() + for d in range(device_count()): + capability = get_device_capability(d) + major = capability[0] + minor = capability[1] + name = get_device_name(d) + current_arch = major * 10 + minor + min_arch = min( + (int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()), + default=35, + ) + if current_arch < min_arch: + warnings.warn( + old_gpu_warn + % (d, name, major, minor, min_arch // 10, min_arch % 10) + ) + + +def _check_cubins(): + incompatible_device_warn = """ +{} with CUDA capability sm_{} is not compatible with the current PyTorch installation. +The current PyTorch install supports CUDA capabilities {}. 
+If you want to use the {} GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/ +""" + if torch.version.cuda is None: # on ROCm we don't want this check + return + arch_list = get_arch_list() + if len(arch_list) == 0: + return + supported_sm = [int(arch.split("_")[1]) for arch in arch_list if "sm_" in arch] + for idx in range(device_count()): + cap_major, cap_minor = get_device_capability(idx) + # NVIDIA GPU compute architectures are backward compatible within major version + supported = any(sm // 10 == cap_major for sm in supported_sm) + if not supported: + device_name = get_device_name(idx) + capability = cap_major * 10 + cap_minor + warnings.warn( + incompatible_device_warn.format( + device_name, capability, " ".join(arch_list), device_name + ) + ) + + +def is_initialized(): + r"""Return whether PyTorch's CUDA state has been initialized.""" + return _initialized and not _is_in_bad_fork() + + +def _lazy_call(callable, **kwargs): + if is_initialized(): + callable() + else: + # TODO(torch_deploy): this accesses linecache, which attempts to read the + # file system to get traceback info. Patch linecache or do something + # else here if this ends up being important. + global _lazy_seed_tracker + if kwargs.get("seed_all", False): + _lazy_seed_tracker.queue_seed_all(callable, traceback.format_stack()) + elif kwargs.get("seed", False): + _lazy_seed_tracker.queue_seed(callable, traceback.format_stack()) + else: + # Don't store the actual traceback to avoid memory cycle + _queued_calls.append((callable, traceback.format_stack())) + + +_lazy_call(_check_capability) +_lazy_call(_check_cubins) + + +class DeferredCudaCallError(Exception): + pass + + +OutOfMemoryError = torch._C._OutOfMemoryError + + +def init(): + r"""Initialize PyTorch's CUDA state. 
+ + You may need to call this explicitly if you are interacting with + PyTorch via its C API, as Python bindings for CUDA functionality + will not be available until this initialization takes place. + Ordinary users should not need this, as all of PyTorch's CUDA methods + automatically initialize CUDA state on-demand. + + Does nothing if the CUDA state is already initialized. + """ + _lazy_init() + + +def _lazy_init(): + global _initialized, _queued_calls + if is_initialized() or hasattr(_tls, "is_initializing"): + return + with _initialization_lock: + # We be double-checked locking, boys! This is OK because + # the above test was GIL protected anyway. The inner test + # is for when a thread blocked on some other thread which was + # doing the initialization; when they get the lock, they will + # find there is nothing left to do. + if is_initialized(): + return + # It is important to prevent other threads from entering _lazy_init + # immediately, while we are still guaranteed to have the GIL, because some + # of the C calls we make below will release the GIL + if _is_in_bad_fork(): + raise RuntimeError( + "Cannot re-initialize CUDA in forked subprocess. To use CUDA with " + "multiprocessing, you must use the 'spawn' start method" + ) + if not hasattr(torch._C, "_cuda_getDeviceCount"): + raise AssertionError("Torch not compiled with CUDA enabled") + if _cudart is None: + raise AssertionError( + "libcudart functions unavailable. It looks like you have a broken build?" + ) + # This function throws if there's a driver initialization error, no GPUs + # are found or any other error occurs + if "CUDA_MODULE_LOADING" not in os.environ: + os.environ["CUDA_MODULE_LOADING"] = "LAZY" + torch._C._cuda_init() + # Some of the queued calls may reentrantly call _lazy_init(); + # we need to just return without initializing in that case. + # However, we must not let any *other* threads in! 
+ _tls.is_initializing = True + + for calls in _lazy_seed_tracker.get_calls(): + if calls: + _queued_calls.append(calls) + + try: + for queued_call, orig_traceback in _queued_calls: + try: + queued_call() + except Exception as e: + msg = ( + f"CUDA call failed lazily at initialization with error: {str(e)}\n\n" + f"CUDA call was originally invoked at:\n\n{''.join(orig_traceback)}" + ) + raise DeferredCudaCallError(msg) from e + finally: + delattr(_tls, "is_initializing") + _initialized = True + + +def cudart(): + _lazy_init() + return _cudart + + +class cudaStatus: + SUCCESS: int = 0 + ERROR_NOT_READY: int = 34 + + +class CudaError(RuntimeError): + def __init__(self, code: int) -> None: + msg = _cudart.cudaGetErrorString(_cudart.cudaError(code)) + super().__init__(f"{msg} ({code})") + + +def check_error(res: int) -> None: + if res != _cudart.cudaError.success: + raise CudaError(res) + + +class _DeviceGuard: + def __init__(self, index: int): + self.idx = index + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch.cuda._exchange_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + self.idx = torch.cuda._maybe_exchange_device(self.prev_idx) + return False + + +class device: + r"""Context-manager that changes the selected device. + + Args: + device (torch.device or int): device index to select. It's a no-op if + this argument is a negative integer or ``None``. + """ + + def __init__(self, device: Any): + self.idx = _get_device_index(device, optional=True) + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch.cuda._exchange_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + self.idx = torch.cuda._maybe_exchange_device(self.prev_idx) + return False + + +class device_of(device): + r"""Context-manager that changes the current device to that of given object. + + You can use both tensors and storages as arguments. If a given object is + not allocated on a GPU, this is a no-op. 
+ + Args: + obj (Tensor or Storage): object allocated on the selected device. + """ + + def __init__(self, obj): + idx = obj.get_device() if obj.is_cuda else -1 + super().__init__(idx) + + +def set_device(device: _device_t) -> None: + r"""Set the current device. + + Usage of this function is discouraged in favor of :any:`device`. In most + cases it's better to use ``CUDA_VISIBLE_DEVICES`` environmental variable. + + Args: + device (torch.device or int): selected device. This function is a no-op + if this argument is negative. + """ + device = _get_device_index(device) + if device >= 0: + torch._C._cuda_setDevice(device) + + +def get_device_name(device: Optional[_device_t] = None) -> str: + r"""Get the name of a device. + + Args: + device (torch.device or int, optional): device for which to return the + name. This function is a no-op if this argument is a negative + integer. It uses the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device + """ + return get_device_properties(device).name + + +def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int]: + r"""Get the cuda capability of a device. + + Args: + device (torch.device or int, optional): device for which to return the + device capability. This function is a no-op if this argument is + a negative integer. It uses the current device, given by + :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` + (default). + + Returns: + tuple(int, int): the major and minor cuda capability of the device + """ + prop = get_device_properties(device) + return prop.major, prop.minor + + +def get_device_properties(device: _device_t) -> _CudaDeviceProperties: + r"""Get the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. 
+ + Returns: + _CudaDeviceProperties: the properties of the device + """ + _lazy_init() # will define _get_device_properties + device = _get_device_index(device, optional=True) + if device < 0 or device >= device_count(): + raise AssertionError("Invalid device id") + return _get_device_properties(device) # type: ignore[name-defined] + + +def can_device_access_peer(device: _device_t, peer_device: _device_t) -> bool: + r"""Check if peer access between two devices is possible.""" + _lazy_init() + device = _get_device_index(device, optional=True) + peer_device = _get_device_index(peer_device) + if device < 0 or device >= device_count(): + raise AssertionError("Invalid device id") + if peer_device < 0 or peer_device >= device_count(): + raise AssertionError("Invalid peer device id") + return torch._C._cuda_canDeviceAccessPeer(device, peer_device) + + +class StreamContext: + r"""Context-manager that selects a given stream. + + All CUDA kernels queued within its context will be enqueued on a selected + stream. + + Args: + Stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + .. note:: Streams are per-device. 
+ """ + cur_stream: Optional["torch.cuda.Stream"] + + def __init__(self, stream: Optional["torch.cuda.Stream"]): + self.stream = stream + self.idx = _get_device_index(None, True) + if not torch.jit.is_scripting(): + if self.idx is None: + self.idx = -1 + + self.src_prev_stream = ( + None if not torch.jit.is_scripting() else torch.cuda.default_stream(None) + ) + self.dst_prev_stream = ( + None if not torch.jit.is_scripting() else torch.cuda.default_stream(None) + ) + + def __enter__(self): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # Return if stream is None or CUDA device not available + if cur_stream is None or self.idx == -1: + return + self.src_prev_stream = torch.cuda.current_stream(None) + + # If the stream is not on the current device, then + # set the current stream on the device + if self.src_prev_stream.device != cur_stream.device: + with device(cur_stream.device): + self.dst_prev_stream = torch.cuda.current_stream(cur_stream.device) + torch.cuda.set_stream(cur_stream) + + def __exit__(self, type: Any, value: Any, traceback: Any): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # If stream is None or no CUDA device available, return + if cur_stream is None or self.idx == -1: + return + + # Reset the stream on the original device + # and destination device + if self.src_prev_stream.device != cur_stream.device: # type: ignore[union-attr] + torch.cuda.set_stream(self.dst_prev_stream) # type: ignore[arg-type] + torch.cuda.set_stream(self.src_prev_stream) # type: ignore[arg-type] + + +def stream(stream: Optional["torch.cuda.Stream"]) -> StreamContext: + r"""Wrap around the Context-manager StreamContext that selects a given stream. + + Arguments: + stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + ..Note:: In eager mode stream is of type Stream class while in JIT it is + an object of the custom class ``torch.classes.cuda.Stream``. 
+ """ + return StreamContext(stream) + + +def _set_stream_by_id(stream_id, device_index, device_type): + r"""set stream specified by the stream id, device index and + device type + + Args: stream_id (int): stream id in stream pool + device_index (int): device index in topo + device_type (int): enum device type + """ + torch._C._cuda_setStream( + stream_id=stream_id, + device_index=device_index, + device_type=device_type, + ) + + +def set_stream(stream: Stream): + r"""Set the current stream.This is a wrapper API to set the stream. + Usage of this function is discouraged in favor of the ``stream`` + context manager. + + Args: + stream (Stream): selected stream. This function is a no-op + if this argument is ``None``. + """ + if stream is None: + return + _set_stream_by_id( + stream_id=stream.stream_id, + device_index=stream.device_index, + device_type=stream.device_type, + ) + + +def _parse_visible_devices() -> Union[List[int], List[str]]: + r"""Parse CUDA_VISIBLE_DEVICES environment variable.""" + var = os.getenv("CUDA_VISIBLE_DEVICES") + if var is None: + return list(range(64)) + + def _strtoul(s: str) -> int: + """Return -1 or positive integer sequence string starts with.""" + if not s: + return -1 + for idx, c in enumerate(s): + if not (c.isdigit() or (idx == 0 and c in "+-")): + break + if idx + 1 == len(s): + idx += 1 + return int(s[:idx]) if idx > 0 else -1 + + def parse_list_with_prefix(lst: str, prefix: str) -> List[str]: + rcs: List[str] = [] + for elem in lst.split(","): + # Repeated id results in empty set + if elem in rcs: + return cast(List[str], []) + # Anything other but prefix is ignored + if not elem.startswith(prefix): + break + rcs.append(elem) + return rcs + + if var.startswith("GPU-"): + return parse_list_with_prefix(var, "GPU-") + if var.startswith("MIG-"): + return parse_list_with_prefix(var, "MIG-") + # CUDA_VISIBLE_DEVICES uses something like strtoul + # which makes `1gpu2,2ampere` is equivalent to `1,2` + rc: List[int] = [] + for elem in 
var.split(","): + x = _strtoul(elem.strip()) + # Repeated ordinal results in empty set + if x in rc: + return cast(List[int], []) + # Negative value aborts the sequence + if x < 0: + break + rc.append(x) + return rc + + +def _raw_device_count_nvml() -> int: + r"""Return number of devices as reported by NVML or negative value if NVML discovery/initialization failed.""" + from ctypes import byref, c_int, CDLL + + nvml_h = CDLL("libnvidia-ml.so.1") + rc = nvml_h.nvmlInit() + if rc != 0: + warnings.warn("Can't initialize NVML") + return -1 + dev_count = c_int(-1) + rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) + if rc != 0: + warnings.warn("Can't get nvml device count") + return -1 + del nvml_h + return dev_count.value + + +def _raw_device_uuid_nvml() -> Optional[List[str]]: + r"""Return list of device UUID as reported by NVML or None if NVM discovery/initialization failed.""" + from ctypes import byref, c_int, c_void_p, CDLL, create_string_buffer + + nvml_h = CDLL("libnvidia-ml.so.1") + rc = nvml_h.nvmlInit() + if rc != 0: + warnings.warn("Can't initialize NVML") + return None + dev_count = c_int(-1) + rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) + if rc != 0: + warnings.warn("Can't get nvml device count") + return None + uuids: List[str] = [] + for idx in range(dev_count.value): + dev_id = c_void_p() + rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) + if rc != 0: + warnings.warn("Can't get device handle") + return None + buf_len = 96 + buf = create_string_buffer(buf_len) + rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) + if rc != 0: + warnings.warn("Can't get device UUID") + return None + uuids.append(buf.raw.decode("ascii").strip("\0")) + del nvml_h + return uuids + + +def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]: + r"""Given the set of partial uuids and list of known uuids builds a set of ordinals excluding ambiguous partials IDs.""" + + def uuid_to_orinal(candidate: str, uuids: List[str]) -> 
int: + best_match = -1 + for idx, uuid in enumerate(uuids): + if not uuid.startswith(candidate): + continue + # Ambiguous candidate + if best_match != -1: + return -1 + best_match = idx + return best_match + + rc: List[int] = [] + for candidate in candidates: + idx = uuid_to_orinal(candidate, uuids) + # First invalid ordinal stops parsing + if idx < 0: + break + # Duplicates result in empty set + if idx in rc: + return cast(List[int], []) + rc.append(idx) + return rc + + +def _device_count_nvml() -> int: + r"""Return number of devices as reported by NVML taking CUDA_VISIBLE_DEVICES into account. + + Negative value is returned if NVML discovery or initialization has failed. + """ + visible_devices = _parse_visible_devices() + if not visible_devices: + return 0 + try: + if type(visible_devices[0]) is str: + # Skip MIG parsing + if visible_devices[0].startswith("MIG-"): + return -1 + uuids = _raw_device_uuid_nvml() + if uuids is None: + return -1 + visible_devices = _transform_uuid_to_ordinals( + cast(List[str], visible_devices), uuids + ) + else: + raw_cnt = _raw_device_count_nvml() + if raw_cnt <= 0: + return raw_cnt + # Trim the list up to a maximum available device + for idx, val in enumerate(visible_devices): + if cast(int, val) >= raw_cnt: + return idx + except OSError: + return -1 + except AttributeError: + return -1 + return len(visible_devices) + + +def _get_nvml_device_index(device: Optional[Union[int, Device]]) -> int: + r"""Return the NVML index of the device, taking CUDA_VISIBLE_DEVICES into account.""" + idx = _get_device_index(device, optional=True) + visible_devices = _parse_visible_devices() + if type(visible_devices[0]) is str: + uuids = _raw_device_uuid_nvml() + if uuids is None: + raise RuntimeError("Can't get device UUIDs") + visible_devices = _transform_uuid_to_ordinals( + cast(List[str], visible_devices), uuids + ) + visible_devices = cast(List[int], visible_devices) + if idx < 0 or idx >= len(visible_devices): + raise RuntimeError( + f"device 
{idx} is not visible (CUDA_VISIBLE_DEVICES={visible_devices})" + ) + return visible_devices[idx] + + +@lru_cache(maxsize=1) +def device_count() -> int: + r"""Return the number of GPUs available.""" + if not _is_compiled(): + return 0 + # bypass _device_count_nvml() if rocm (not supported) + nvml_count = -1 if torch.version.hip else _device_count_nvml() + return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count + + +def get_arch_list() -> List[str]: + r"""Return list CUDA architectures this library was compiled for.""" + if not is_available(): + return [] + arch_flags = torch._C._cuda_getArchFlags() + if arch_flags is None: + return [] + return arch_flags.split() + + +def get_gencode_flags() -> str: + r"""Return NVCC gencode flags this library was compiled with.""" + arch_list = get_arch_list() + if len(arch_list) == 0: + return "" + arch_list_ = [arch.split("_") for arch in arch_list] + return " ".join( + [ + f"-gencode compute=compute_{arch},code={kind}_{arch}" + for (kind, arch) in arch_list_ + ] + ) + + +def current_device() -> int: + r"""Return the index of a currently selected device.""" + _lazy_init() + return torch._C._cuda_getDevice() + + +def synchronize(device: _device_t = None) -> None: + r"""Wait for all kernels in all streams on a CUDA device to complete. + + Args: + device (torch.device or int, optional): device for which to synchronize. + It uses the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + _lazy_init() + with torch.cuda.device(device): + return torch._C._cuda_synchronize() + + +def ipc_collect(): + r"""Force collects GPU memory after it has been released by CUDA IPC. + + .. note:: + Checks if any sent CUDA tensors could be cleaned from the memory. Force + closes shared memory file used for reference counting if there is no + active counters. Useful when the producer process stopped actively sending + tensors and want to release unused memory. 
+ """ + _lazy_init() + return torch._C._cuda_ipc_collect() + + +def current_stream(device: Optional[_device_t] = None) -> Stream: + r"""Return the currently selected :class:`Stream` for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + the currently selected :class:`Stream` for the current device, given + by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` + (default). + """ + _lazy_init() + streamdata = torch._C._cuda_getCurrentStream( + _get_device_index(device, optional=True) + ) + return Stream( + stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2] + ) + + +def default_stream(device: Optional[_device_t] = None) -> Stream: + r"""Return the default :class:`Stream` for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + the default :class:`Stream` for the current device, given by + :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` + (default). + """ + _lazy_init() + streamdata = torch._C._cuda_getDefaultStream( + _get_device_index(device, optional=True) + ) + return Stream( + stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2] + ) + + +def current_blas_handle(): + r"""Return cublasHandle_t pointer to current cuBLAS handle""" + _lazy_init() + return torch._C._cuda_getCurrentBlasHandle() + + +def set_sync_debug_mode(debug_mode: Union[int, str]) -> None: + r"""Set the debug mode for cuda synchronizing operations. + + Args: + debug_mode(str or int): if "default" or 0, don't error or warn on synchronizing operations, + if "warn" or 1, warn on synchronizing operations, if "error" or 2, error out synchronizing operations. + + Warning: + This is an experimental feature, and not all synchronizing operations will trigger warning or error. In + particular, operations in torch.distributed and torch.sparse namespaces are not covered yet. 
+ """ + _lazy_init() + if isinstance(debug_mode, str): + if debug_mode == "default": + debug_mode = 0 + elif debug_mode == "warn": + debug_mode = 1 + elif debug_mode == "error": + debug_mode = 2 + else: + raise RuntimeError( + "invalid value of debug_mode, expected one of `default`, `warn`, `error`" + ) + + torch._C._cuda_set_sync_debug_mode(debug_mode) + + +def get_sync_debug_mode() -> int: + r"""Return current value of debug mode for cuda synchronizing operations.""" + _lazy_init() + return torch._C._cuda_get_sync_debug_mode() + + +def _get_pynvml_handler(device: Optional[Union[Device, int]] = None): + if not _HAS_PYNVML: + raise ModuleNotFoundError( + "pynvml does not seem to be installed or it can't be imported." + ) from _PYNVML_ERR + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded as e: + raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + return handle + + +def memory_usage(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the percent of time over the past sample period during which global (device) + memory was being read or written as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. 
+ """ + handle = _get_pynvml_handler() + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + return pynvml.nvmlDeviceGetUtilizationRates(handle).memory + + +def utilization(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the percent of time over the past sample period during which one or + more kernels was executing on the GPU as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu + + +def temperature(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the average temperature of the GPU sensor in Degrees C (Centigrades). + + The average temperature is computed based on past sample period as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + # 0 refers to the temperature sensor for the GPU die. + return pynvml.nvmlDeviceGetTemperature(handle, 0) + + +def power_draw(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the average power draw of the GPU sensor in mW (MilliWatts) + over the past sample period as given by `nvidia-smi` for Fermi or newer fully supported devices. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + return pynvml.nvmlDeviceGetPowerUsage(handle) + + +def clock_rate(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the clock speed of the GPU SM in Hz Hertz over the past sample period as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + return pynvml.nvmlDeviceGetClockInfo(handle, 1) + + +def _get_device(device: Union[int, str, torch.device]) -> torch.device: + r"""Return the torch.device type object from the passed in device. + + Args: + device (torch.device or int): selected device. + """ + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("cuda", device) + return device + + +def _get_generator(device: torch.device) -> torch._C.Generator: + r"""Return the CUDA Generator object for the given device. + + Args: + device (torch.device): selected device. + """ + idx = device.index + if idx is None: + idx = current_device() + return torch.cuda.default_generators[idx] + + +def _set_rng_state_offset( + offset: int, device: Union[int, str, torch.device] = "cuda" +) -> None: + r"""Set the random number generator state offset of the specified GPU. + + Args: + offset (int): The desired offset + device (torch.device or int, optional): The device to set the RNG state. + Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device). 
+ """ + final_device = _get_device(device) + + def cb(): + default_generator = _get_generator(final_device) + default_generator.set_offset(offset) + + _lazy_call(cb) + + +def _get_rng_state_offset(device: Union[int, str, torch.device] = "cuda") -> int: + r"""Return the random number generator state offset of the specified GPU. + + Args: + device (torch.device or int, optional): The device to return the RNG state offset of. + Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device). + + .. warning:: + This function eagerly initializes CUDA. + """ + _lazy_init() + final_device = _get_device(device) + default_generator = _get_generator(final_device) + return default_generator.get_offset() + + +from .memory import * # noqa: F403 + + +from .random import * # noqa: F403 + +################################################################################ +# Define Storage and Tensor classes +################################################################################ + + +@staticmethod # type: ignore[misc] +def _lazy_new(cls, *args, **kwargs): + _lazy_init() + # We may need to call lazy init again if we are a forked child + # del _CudaBase.__new__ + return super(_CudaBase, cls).__new__(cls, *args, **kwargs) + + +class _CudaBase: + is_cuda = True + is_sparse = False + + def type(self, *args, **kwargs): + # We could use a Protocol here to tell mypy that self has `get_device` method + # but it is only available in the typing module on Python >= 3.8 + # or on typing_extensions module on Python >= 3.6 + with device(self.get_device()): # type: ignore[attr-defined] + return super().type(*args, **kwargs) # type: ignore[misc] + + __new__ = _lazy_new + + +from torch.storage import _LegacyStorage, _warn_typed_storage_removal + + +class _CudaLegacyStorage(_LegacyStorage): + @classmethod + def from_buffer(cls, *args, **kwargs): + _warn_typed_storage_removal() + raise RuntimeError("from_buffer: Not available for CUDA storage") + + @classmethod + def 
_new_with_weak_ptr(cls, *args, **kwargs): + raise RuntimeError("_new_with_weak_ptr: Not available for CUDA storage") + + @classmethod + def _new_shared_filename(cls, manager, obj, size, *, device=None, dtype=None): + raise RuntimeError("_new_shared_filename: Not available for CUDA storage") + + +class ByteStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.uint8 + + +class DoubleStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.double + + +class FloatStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.float + + +class HalfStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.half + + +class LongStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.long + + +class IntStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.int + + +class ShortStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.short + + +class CharStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.int8 + + +class BoolStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def 
_dtype(self): + return torch.bool + + +class BFloat16Storage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.bfloat16 + + +class ComplexDoubleStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.cdouble + + +class ComplexFloatStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.cfloat + + +del _LegacyStorage +del _CudaLegacyStorage + +torch._storage_classes.add(DoubleStorage) +torch._storage_classes.add(FloatStorage) +torch._storage_classes.add(LongStorage) +torch._storage_classes.add(IntStorage) +torch._storage_classes.add(ShortStorage) +torch._storage_classes.add(CharStorage) +torch._storage_classes.add(ByteStorage) +torch._storage_classes.add(HalfStorage) +torch._storage_classes.add(BoolStorage) +torch._storage_classes.add(BFloat16Storage) +torch._storage_classes.add(ComplexDoubleStorage) +torch._storage_classes.add(ComplexFloatStorage) + + +class _WrappedTritonKernel: + """Just a simple wrapper to store some metadata for testing purposes.""" + + def __init__(self, kernel): + self.kernel = kernel + self.kernel_invoked = False + + def __call__(self, *args, **kwargs): + res = self.kernel(*args, **kwargs) + self.kernel_invoked = True + return res + + +def _register_triton_kernels(): + if torch._running_with_deploy(): + return + + @_WrappedTritonKernel + def kernel_impl(*args, **kwargs): + from torch.sparse._triton_ops import bsr_dense_mm + + return bsr_dense_mm(*args, skip_checks=True, **kwargs) + + @_WrappedTritonKernel + def addmm_kernel_impl(*args, **kwargs): + from torch.sparse._triton_ops import bsr_dense_addmm + + return bsr_dense_addmm(*args, skip_checks=True, **kwargs) + + has_triton = 
importlib.util.find_spec("triton") is not None + if has_triton: + torch._TritonLibrary.registerOp( + "_triton_bsr_dense_mm_out", + "_triton_bsr_dense_mm_out(Tensor bsr, Tensor dense, *, Tensor(a!) out) -> Tensor(a!)", + kernel_impl, + "SparseCsrCUDA", + ) + + torch._TritonLibrary.registerOp( + "_triton_bsr_dense_addmm_out", + ( + "_triton_bsr_dense_addmm_out(Tensor input, Tensor bsr, Tensor dense," + " *, Scalar beta, Scalar alpha, Tensor(a!) out) -> Tensor(a!)" + ), + addmm_kernel_impl, + "SparseCsrCUDA", + ) + + +_lazy_call(_register_triton_kernels) + + +from . import amp, jiterator, nvtx, profiler, sparse + +__all__ = [ + # Typed storage and tensors + "BFloat16Storage", + "BFloat16Tensor", + "BoolStorage", + "BoolTensor", + "ByteStorage", + "ByteTensor", + "CharStorage", + "CharTensor", + "ComplexDoubleStorage", + "ComplexFloatStorage", + "DoubleStorage", + "DoubleTensor", + "FloatStorage", + "FloatTensor", + "HalfStorage", + "HalfTensor", + "IntStorage", + "IntTensor", + "LongStorage", + "LongTensor", + "ShortStorage", + "ShortTensor", + "CUDAGraph", + "CudaError", + "DeferredCudaCallError", + "Event", + "ExternalStream", + "OutOfMemoryError", + "Stream", + "StreamContext", + "amp", + "caching_allocator_alloc", + "caching_allocator_delete", + "can_device_access_peer", + "check_error", + "cudaStatus", + "cudart", + "current_blas_handle", + "current_device", + "current_stream", + "default_generators", + "default_stream", + "device", + "device_count", + "device_of", + "empty_cache", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "get_arch_list", + "get_device_capability", + "get_device_name", + "get_device_properties", + "get_gencode_flags", + "get_rng_state", + "get_rng_state_all", + "get_sync_debug_mode", + "graph", + "graph_pool_handle", + "graphs", + "has_half", + "has_magma", + "init", + "initial_seed", + "ipc_collect", + "is_available", + "is_bf16_supported", + "is_current_stream_capturing", + "is_initialized", + 
"jiterator", + "list_gpu_processes", + "make_graphed_callables", + "manual_seed", + "manual_seed_all", + "max_memory_allocated", + "max_memory_cached", + "max_memory_reserved", + "mem_get_info", + "memory", + "memory_allocated", + "memory_cached", + "memory_reserved", + "memory_snapshot", + "memory_stats", + "memory_stats_as_nested_dict", + "memory_summary", + "memory_usage", + "temperature", + "power_draw", + "clock_rate", + "nccl", + "nvtx", + "profiler", + "random", + "reset_accumulated_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "reset_peak_memory_stats", + "seed", + "seed_all", + "set_device", + "set_per_process_memory_fraction", + "set_rng_state", + "set_rng_state_all", + "set_stream", + "set_sync_debug_mode", + "sparse", + "stream", + "streams", + "synchronize", + "utilization", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/memory.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/memory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce5c3b1f96681facd89669c1832accefa7f8884b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/memory.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/_sanitizer.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/_sanitizer.py new file mode 100644 index 0000000000000000000000000000000000000000..c0b0297366db73b31bd15a5ba7b30d86164b3f31 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/_sanitizer.py @@ -0,0 +1,622 @@ +r""" +This module introduces CUDA Sanitizer, a tool for detecting synchronization errors between kernels ran on different streams. + +It stores information on accesses to tensors to determine if they are synchronized +or not. 
When enabled in a python program and a possible data race is detected, a +detailed warning will be printed and the program will exit. + +It can be enabled either by importing this module and calling +:func:`enable_cuda_sanitizer()` or by exporting the ``TORCH_CUDA_SANITIZER`` +environment variable. +""" + +import enum +import functools +import inspect +import io +import logging +import sys +import textwrap +import traceback +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, TypeVar + +import torch +import torch.utils._cuda_trace as cuda_trace +from torch.utils import _pytree as pytree +from torch.utils._python_dispatch import TorchDispatchMode + + +DEFAULT_STREAM_ID = 0 + +TK = TypeVar("TK") +TVa = TypeVar("TVa") +TVb = TypeVar("TVb") + +DataPtr = int +StreamId = int +EventId = int +SeqNum = int + +logger = logging.getLogger(__name__) + + +class AccessType(enum.Enum): + READ = enum.auto() + WRITE = enum.auto() + + def __str__(self): + return "reading from" if self is AccessType.READ else "writing to" + + +@dataclass +class Access: + r"""Stores information about a single access to a tensor by a kernel. + + Args: + type: either AccessType.READ or AccessType.Write. + seq_num: the sequential number of the kernel performing the access. + stream: the stream id of the stream executing the kernel. + operator: the schema of the launched kernel, which lists the + arguments and return type. + aliases: the arguments in the schema this access corresponds to. + is_output: Whether the tensor was an output of the kernel. + stack_trace: the stack summary object captured during access. 
+ """ + + type: AccessType + seq_num: SeqNum + stream: StreamId + operator: str + aliases: List[str] + is_output: bool + stack_trace: traceback.StackSummary + + +class SynchronizationError(Exception): + """Base class for errors detected by CUDA Sanitizer.""" + + pass + + +class UnsynchronizedAccessError(SynchronizationError): + """Stores information about two unsynchronized accesses to one data pointer.""" + + def __init__( + self, + data_ptr: DataPtr, + allocation_stack_trace: Optional[traceback.StackSummary], + current_access: Access, + previous_access: Access, + ): + self.data_ptr = data_ptr + self.allocation_stack_trace = allocation_stack_trace + self.current_access = current_access + self.previous_access = previous_access + + def __str__(self): + def format_access(access: Access): + message.write(f"{access.operator}\n{access.type}") + if access.aliases: + message.write(" argument(s) " + ", ".join(access.aliases)) + if access.is_output: + message.write(", and to") + if access.is_output: + message.write(" the output") + message.write( + f"\nWith stack trace:\n{''.join(access.stack_trace.format())}\n" + ) + + with io.StringIO() as message: + message.write( + textwrap.dedent( + f"""\ + ============================ + CSAN detected a possible data race on tensor with data pointer {self.data_ptr} + Access by stream {self.current_access.stream} during kernel: + """ + ) + ) + format_access(self.current_access) + + message.write( + f"Previous access by stream {self.previous_access.stream} during kernel:\n" + ) + format_access(self.previous_access) + + if self.allocation_stack_trace: + message.write( + "Tensor was allocated with stack trace:\n" + f"{''.join(self.allocation_stack_trace.format())}" + ) + else: + message.write("Trace for tensor allocation not found.") + return message.getvalue() + + +class CUDASanitizerErrors(Exception): + """Wrapper class for errors reported by CUDA Sanitizer.""" + + def __init__(self, errors: List[SynchronizationError]): + self.errors = 
errors + + def __str__(self): + return f"detected {len(self.errors)} errors" + + +@dataclass +class TensorInfo: + r"""Stores information about a single tensor and recent accesses to it. + + Args: + allocation_stack_trace: the stack summary object captured during tensor + allocation. Can be ``None`` if the allocation wasn't caught by CSAN. + reads: list of read accesses to the tensor that were performed since + the last write. + write: the last write access to the tensor. + """ + + allocation_stack_trace: Optional[traceback.StackSummary] + reads: List[Access] = field(default_factory=list) + write: Optional[Access] = None + + +class _TensorsAccessed: + def __init__(self): + self.accesses: Dict[DataPtr, TensorInfo] = {} + + def ensure_tensor_exists(self, data_ptr: DataPtr) -> None: + if data_ptr not in self.accesses: + logger.info( + "Found tensor with pointer: %s, but no matching tensor " + "allocation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + data_ptr, + ) + self.create_tensor(data_ptr, None) + + def ensure_tensor_does_not_exist(self, data_ptr: DataPtr) -> None: + if data_ptr in self.accesses: + logger.info( + "Found duplicate tensor allocation in the trace for tensor with " + "pointer: %s. Assuming the trace for tensor deallocation " + "wasn't caught and backfilling it now. 
" + "Perhaps the sanitizer was enabled after some torch operations?", + data_ptr, + ) + self.delete_tensor(data_ptr) + + def create_tensor( + self, data_ptr: DataPtr, stack_trace: Optional[traceback.StackSummary] + ) -> None: + self.accesses[data_ptr] = TensorInfo(stack_trace) + + def delete_tensor(self, data_ptr: DataPtr) -> None: + del self.accesses[data_ptr] + + def were_there_reads_since_last_write(self, data_ptr: DataPtr) -> bool: + return True if self.accesses[data_ptr].reads else False + + def get_allocation_stack_trace( + self, data_ptr: DataPtr + ) -> Optional[traceback.StackSummary]: + return self.accesses[data_ptr].allocation_stack_trace + + def get_write(self, data_ptr: DataPtr) -> Optional[Access]: + return self.accesses[data_ptr].write + + def get_reads(self, data_ptr: DataPtr) -> List[Access]: + return self.accesses[data_ptr].reads + + def add_read(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].reads.append(access) + + def set_write(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].write = access + self.accesses[data_ptr].reads = [] + + +class StreamSynchronizations: + def __init__(self): + self.current_sync_states: Dict[StreamId, Dict[StreamId, SeqNum]] = {} + self.recorded_sync_states: Dict[EventId, Dict[StreamId, SeqNum]] = {} + self.host_sync_state: Dict[StreamId, SeqNum] = {} + self.create_stream(DEFAULT_STREAM_ID) + + def _ensure_stream_exists(self, stream: StreamId) -> None: + if stream not in self.current_sync_states: + logger.info( + "Found Stream with id: %s, but no matching stream " + "creation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + stream, + ) + self.create_stream(stream) + + def _ensure_event_exists(self, event: EventId) -> None: + if event not in self.recorded_sync_states: + logger.info( + "Found Event with id: %s, but no matching event " + "creation in the trace. Backfilling the trace now. 
" + "Perhaps the sanitizer was enabled after some torch operations?", + event, + ) + self.create_event(event) + + def _ensure_event_does_not_exist(self, event: EventId) -> None: + if event in self.recorded_sync_states: + logger.info( + "Found duplicate event creation in the trace for event with " + "id: %s. Assuming the trace for event deletion wasn't caught " + "and backfilling it now. " + "Perhaps the sanitizer was enabled after some torch operations?", + event, + ) + self.delete_event(event) + + def create_stream(self, stream: StreamId) -> None: + if stream in self.current_sync_states: + logger.info( + "Found duplicate Stream creation in the trace for Stream with " + "id: %s. PyTorch Streams are only created once, so this " + "trace entry is ignored.", + stream, + ) + else: + self.host_sync_state[stream] = 0 + self.current_sync_states[stream] = self.host_sync_state.copy() + + def create_event(self, event: EventId) -> None: + self._ensure_event_does_not_exist(event) + self.recorded_sync_states[event] = {} + + def delete_event(self, event: EventId) -> None: + self._ensure_event_exists(event) + del self.recorded_sync_states[event] + + def update_seq_num(self, stream: StreamId, seq_num: SeqNum) -> None: + self._ensure_stream_exists(stream) + self.current_sync_states[stream][stream] = seq_num + + def record_state(self, event: EventId, stream: StreamId) -> None: + self._ensure_event_exists(event) + self._ensure_stream_exists(stream) + self.recorded_sync_states[event] = self.current_sync_states[stream].copy() + + def _state_wait_for_other( + self, state: Dict[StreamId, SeqNum], other: Dict[StreamId, SeqNum] + ) -> None: + for stream, seq_num in other.items(): + state[stream] = max(state.get(stream, -1), seq_num) + + def stream_wait_for_event(self, stream: StreamId, event: EventId) -> None: + self._ensure_stream_exists(stream) + self._ensure_event_exists(event) + self._state_wait_for_other( + self.current_sync_states[stream], self.recorded_sync_states[event] + ) + + def 
all_streams_wait_for_event(self, event: EventId) -> None: + self._ensure_event_exists(event) + for stream in self.current_sync_states.keys(): + self.stream_wait_for_event(stream, event) + + self._state_wait_for_other( + self.host_sync_state, self.recorded_sync_states[event] + ) + + def all_streams_wait_for_stream(self, stream: StreamId) -> None: + self._ensure_stream_exists(stream) + for state in self.current_sync_states.values(): + self._state_wait_for_other(state, self.current_sync_states[stream]) + + self._state_wait_for_other( + self.host_sync_state, self.current_sync_states[stream] + ) + + def sync_all_streams(self) -> None: + for stream, state in self.current_sync_states.items(): + self.host_sync_state[stream] = state[stream] + + for state in self.current_sync_states.values(): + self._state_wait_for_other(state, self.host_sync_state) + + def is_ordered_after( + self, current_stream: StreamId, seq_num: SeqNum, other_stream: StreamId + ) -> bool: + self._ensure_stream_exists(current_stream) + self._ensure_stream_exists(other_stream) + return seq_num <= self.current_sync_states[current_stream].get(other_stream, -1) + + +class EventHandler: + """Analyzes CSAN trace for synchronization errors. + + Stores information on each stream's synchronizations with other streams as well + as tensor accesses to determine whether a given kernel launch might cause a + data race. 
+ """ + + def __init__(self): + self.tensors_accessed = _TensorsAccessed() + self.syncs = StreamSynchronizations() + self.seq_num: SeqNum = 0 + + def _handle_kernel_launch( + self, + stream: StreamId, + read_only: Set[DataPtr], + read_write: Set[DataPtr], + outputs: Set[DataPtr], + operator: str, + tensor_aliases: Dict[int, List[str]], + ) -> List[SynchronizationError]: + def check_conflict( + data_ptr: DataPtr, current_access: Access, previous_access: Optional[Access] + ) -> None: + if previous_access is None: + return + if not self.syncs.is_ordered_after( + current_access.stream, previous_access.seq_num, previous_access.stream + ): + error_list.append( + UnsynchronizedAccessError( + data_ptr, + self.tensors_accessed.get_allocation_stack_trace(data_ptr), + current_access, + previous_access, + ) + ) + + error_list: List[SynchronizationError] = [] + self.seq_num += 1 + self.syncs.update_seq_num(stream, self.seq_num) + stack_trace = traceback.StackSummary.extract( + traceback.walk_stack(inspect.currentframe()), lookup_lines=False + ) + # The stack trace generated in this way is in the inverse order, so it must be + # reversed. 
+ stack_trace.reverse() + + for data_ptr in read_only: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.READ, + self.seq_num, + stream, + operator, + tensor_aliases[data_ptr], + data_ptr in outputs, + stack_trace, + ) + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.add_read(data_ptr, current_access) + + for data_ptr in read_write: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.WRITE, + self.seq_num, + stream, + operator, + tensor_aliases[data_ptr], + data_ptr in outputs, + stack_trace, + ) + if self.tensors_accessed.were_there_reads_since_last_write(data_ptr): + for previous_access in self.tensors_accessed.get_reads(data_ptr): + check_conflict(data_ptr, current_access, previous_access) + else: + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.set_write(data_ptr, current_access) + + return error_list + + def _handle_event_creation(self, event: EventId) -> None: + self.syncs.create_event(event) + + def _handle_event_deletion(self, event: EventId) -> None: + self.syncs.delete_event(event) + + def _handle_event_record(self, event: EventId, stream: StreamId) -> None: + self.syncs.record_state(event, stream) + + def _handle_event_wait(self, event: EventId, stream: StreamId) -> None: + self.syncs.stream_wait_for_event(stream, event) + + def _handle_memory_allocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_does_not_exist(data_ptr) + stack_trace = traceback.StackSummary.extract( + traceback.walk_stack(inspect.currentframe()), lookup_lines=False + ) + # The stack trace generated in this way is in the inverse order, so it must be + # reversed. 
+ stack_trace.reverse() + self.tensors_accessed.create_tensor( + data_ptr, + stack_trace, + ) + + def _handle_memory_deallocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + self.tensors_accessed.delete_tensor(data_ptr) + + def _handle_stream_creation(self, stream: StreamId) -> None: + self.syncs.create_stream(stream) + + def _handle_device_synchronization(self) -> None: + self.syncs.sync_all_streams() + + def _handle_stream_synchronization(self, stream: StreamId) -> None: + self.syncs.all_streams_wait_for_stream(stream) + + def _handle_event_synchronization(self, event: EventId) -> None: + self.syncs.all_streams_wait_for_event(event) + + +def zip_by_key(a: Dict[TK, TVa], b: Dict[TK, TVb]) -> Iterator[Tuple[TK, TVa, TVb]]: + for arg, value in a.items(): + if arg in b: + yield arg, value, b[arg] + + +def zip_arguments( + schema: torch.FunctionSchema, args: Tuple[Any, ...], kwargs: Dict[str, Any] +) -> Iterator[Tuple[torch.Argument, Any]]: + schema_args = schema.arguments[: len(args)] + schema_kwargs = {arg.name: arg for arg in schema.arguments[len(args) :]} + + yield from zip(schema_args, args) + + for _, argument, value in zip_by_key(schema_kwargs, kwargs): + yield (argument, value) + + +class ArgumentHandler: + def __init__(self): + self.dataptrs_read: Set[DataPtr] = set() + self.dataptrs_written: Set[DataPtr] = set() + self.tensor_aliases: Dict[DataPtr, List[str]] = dict() + self.outputs: Set[DataPtr] = set() + + def _handle_argument( + self, + value: Any, + is_write: bool, + name: Optional[str] = None, + is_output: bool = False, + ) -> None: + if isinstance(value, torch.Tensor) and value.is_cuda: + data_ptr = value.data_ptr() + if is_write: + self.dataptrs_written.add(data_ptr) + else: + self.dataptrs_read.add(data_ptr) + + self.tensor_aliases.setdefault(data_ptr, []) + if name is not None: + self.tensor_aliases[data_ptr].append(name) + if is_output: + self.outputs.add(data_ptr) + + def parse_inputs( + self, + 
schema: torch.FunctionSchema, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> None: + for argument, value in zip_arguments(schema, args, kwargs): + is_write = argument.alias_info is not None and argument.alias_info.is_write + pytree.tree_map_( + functools.partial( + self._handle_argument, is_write=is_write, name=argument.name + ), + value, + ) + + def parse_outputs(self, outputs: Any) -> None: + pytree.tree_map_( + functools.partial(self._handle_argument, is_write=True, is_output=True), + outputs, + ) + + +class CUDASanitizerDispatchMode(TorchDispatchMode): + def __init__(self): + self.event_handler = EventHandler() + torch._C._activate_cuda_trace() + cuda_trace.register_callback_for_cuda_event_creation( + self.event_handler._handle_event_creation + ) + cuda_trace.register_callback_for_cuda_event_deletion( + self.event_handler._handle_event_deletion + ) + cuda_trace.register_callback_for_cuda_event_record( + self.event_handler._handle_event_record + ) + cuda_trace.register_callback_for_cuda_event_wait( + self.event_handler._handle_event_wait + ) + cuda_trace.register_callback_for_cuda_memory_allocation( + self.event_handler._handle_memory_allocation + ) + cuda_trace.register_callback_for_cuda_memory_deallocation( + self.event_handler._handle_memory_deallocation + ) + cuda_trace.register_callback_for_cuda_stream_creation( + self.event_handler._handle_stream_creation + ) + cuda_trace.register_callback_for_cuda_device_synchronization( + self.event_handler._handle_device_synchronization + ) + cuda_trace.register_callback_for_cuda_stream_synchronization( + self.event_handler._handle_stream_synchronization + ) + cuda_trace.register_callback_for_cuda_event_synchronization( + self.event_handler._handle_event_synchronization + ) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + + argument_handler = ArgumentHandler() + argument_handler.parse_inputs(func._schema, args, kwargs) + + outputs = func(*args, 
**kwargs) + + argument_handler.parse_outputs(outputs) + errors = self.event_handler._handle_kernel_launch( + torch.cuda.current_stream().cuda_stream, + argument_handler.dataptrs_read - argument_handler.dataptrs_written, + argument_handler.dataptrs_written, + argument_handler.outputs, + func._schema, + argument_handler.tensor_aliases, + ) + if errors: + for error in errors: + print(error, file=sys.stderr) + raise CUDASanitizerErrors(errors) + + return outputs + + +class CUDASanitizer: + """Manages the lifetime of a CUDASanitizer dispatch mode object. + + The CUDASanitizer class wraps the entering/exiting functions of the dispatch mode + context manager in the enable function/destructor, respectively. This is to + explicitly set the lifetime of the dispatch mode object to that of the application. + This approach was deemed more elegant than using the atexit module. + """ + + def __init__(self): + self.dispatch = CUDASanitizerDispatchMode() + self.enabled = False + + def enable(self): + self.dispatch.__enter__() + self.enabled = True + + def __del__(self): + if self.enabled: + self.dispatch.__exit__(None, None, None) + + +def enable_cuda_sanitizer(): + """Enable CUDA Sanitizer. + + The sanitizer will begin to analyze low-level CUDA calls invoked by torch functions + for synchronization errors. All data races found will be printed to the standard + error output along with stack traces of suspected causes. For best results, the + sanitizer should be enabled at the very beginning of the program. 
+ """ + cuda_sanitizer.enable() + + +cuda_sanitizer = CUDASanitizer() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb9ee0705b4a8b3296e4717f78396cf8ae2b5ece Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/comm.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea23c2072d86a61db643fcfbfb799e97267e5e9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/comm.py @@ -0,0 +1,18 @@ +# The functions here have been moved to torch.nn.parallel.comm +from torch.nn.parallel.comm import ( + broadcast, + broadcast_coalesced, + gather, + reduce_add, + reduce_add_coalesced, + scatter, +) + +__all__ = [ + "broadcast", + "broadcast_coalesced", + "reduce_add", + "reduce_add_coalesced", + "scatter", + "gather", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/memory.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..60440c58dc1d057b744fc91a6254757b74839225 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/memory.py @@ -0,0 +1,914 @@ +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature + +from typing import Any, Dict, Optional, Tuple, 
Union + +import torch +from torch import _C + +from torch.types import Device +from .._utils import _dummy_type +from . import _get_device_index, _get_nvml_device_index, _lazy_init, is_initialized + +from ._memory_viz import memory as _memory, segments as _segments + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. 
+ """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. 
+ - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + warnings.warn( + "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved", + FutureWarning, + ) + return memory_reserved(device=device) + + +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + warnings.warn( + "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved", + FutureWarning, + ) + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." 
+ + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + " {:<21} | {} | {} | {} | {} ".format( + submetric_name, + formatter(current, current_prefval), + formatter(peak, peak_prefval), + formatter(allocated, allocated_prefval), + formatter(freed, freed_prefval), + ), + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + " {:<21} | {} | {} | {} | {} ".format( + metric_name, + formatter(current, current), + formatter(peak, peak), + formatter(allocated, allocated), + formatter(freed, freed), + ), + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ """ + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + mem = p.usedGpuMemory / (1024 * 1024) + lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + ) + + +def _record_memory_history(enabled="all", *args, **kwargs): + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. 
+ + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. 
+ """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. 
+ size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. 
+ + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. 
The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. 
note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_compatibility.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_compatibility.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cceaa42fd40faee5b2edc08bd0971a003eef434e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_compatibility.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a610ac0386e856f0cb8d45596c9216381063379a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/annotate.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/annotate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80f345d9f55c6cf2b28038dc3a9f537758c190b6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/annotate.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/config.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48a9d1734665287fde13236e8937c67a054264b4 
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/config.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/node.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/node.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0ea80f6ddf1f353c9d65b9611a34eb96cd915e1 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/node.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/traceback.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/traceback.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa49fe2a3fbf0d33768b475eaf6d70ced19e43b6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/traceback.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ef41bb4a298c12acfd5f112bdec5ff17391c22e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d9f6b5b9c34c92ecca468f34d393c74b7022a4c4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07b94a049216103d8fd9b66c57b802701db3afa2 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e99a6c7683bee805285ad20b5e027209c9aea513 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e05f43ec71663ccc219e1cff260122a59f04d6e7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e199b0e262bd724f4786adacd3a79b7e0b2b385 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6595359d20574b5ef4d99297044415e60396ba5b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/cudagraphs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/cudagraphs.py new file mode 100644 index 0000000000000000000000000000000000000000..d423de930dc7e8480a4f629ebc542782b0576b21 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/cudagraphs.py @@ -0,0 +1,56 @@ +import torch +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch.fx.passes.operator_support import OperatorSupport +from torch.fx.passes.tools_common import CALLABLE_NODE_OPS +from torch.fx.passes.fake_tensor_prop import FakeTensorProp +from torch.utils import _pytree as pytree + +import operator + +class CudaGraphsSupport(OperatorSupport): + # TODO: why is submodules passed here + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + if node.op not in CALLABLE_NODE_OPS: + return False + + if node.target in 
[torch.ops.aten.embedding_dense_backward.default]: + return False + + if node.target in [operator.getitem]: + return True + + found_not_cuda = False + + def meta_fk(meta): + return meta["val"] if "val" in meta else meta["fake_result"] + + def find_not_cuda(t): + nonlocal found_not_cuda + if isinstance(t, torch.Tensor) and t.device.type != 'cuda': + found_not_cuda = True + + for n in node.all_input_nodes: + pytree.tree_map_(find_not_cuda, meta_fk(n.meta)) + + pytree.tree_map_(find_not_cuda, meta_fk(node.meta)) + + # NB: factory function is accounted for because the result would be + # cpu or cuda + + return not found_not_cuda + +def partition_cudagraphs(gm, inputs): + """ + Partition an FX graph into sub-GraphModules that can be validly run under + CUDA graphs. For a subgraph to be runnable under CUDA, all of the operations + must involve CUDA tensors only/ + """ + + FakeTensorProp(gm).propagate(*inputs) + supported_ops = CudaGraphsSupport() + # TODO: single node partition may be wrong due to the pessimization + # from copying in and out the data. 
Check in benchmarks, perhaps + partitioner = CapabilityBasedPartitioner(gm, supported_ops, allows_single_node_partition=True) + partitions = partitioner.propose_partitions() + fused_graph = partitioner.fuse_partitions(partitions) + return fused_graph diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..472c5b4f2ac04ad832913d974333d86fd827a272 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06ae7e3164f22c056bf9b810086305bdb97dddfb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d0ed6c0738e1c90d5360ca5593a5bbdd268afea Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/test_pass_manager.py 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/test_pass_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..60ed6671179b2c20fa0be176631d1415009ee87a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/test_pass_manager.py @@ -0,0 +1,58 @@ +import unittest + +from ..pass_manager import ( + inplace_wrapper, + PassManager, + these_before_those_pass_constraint, + this_before_that_pass_constraint, +) + + +class TestPassManager(unittest.TestCase): + def test_pass_manager_builder(self) -> None: + passes = [lambda x: 2 * x for _ in range(10)] + pm = PassManager(passes) + pm.validate() + + def test_this_before_that_pass_constraint(self) -> None: + passes = [lambda x: 2 * x for _ in range(10)] + pm = PassManager(passes) + + # add unfulfillable constraint + pm.add_constraint(this_before_that_pass_constraint(passes[-1], passes[0])) + + self.assertRaises(RuntimeError, pm.validate) + + def test_these_before_those_pass_constraint(self) -> None: + passes = [lambda x: 2 * x for _ in range(10)] + constraint = these_before_those_pass_constraint(passes[-1], passes[0]) + pm = PassManager( + [inplace_wrapper(p) for p in passes] + ) + + # add unfulfillable constraint + pm.add_constraint(constraint) + + self.assertRaises(RuntimeError, pm.validate) + + def test_two_pass_managers(self) -> None: + """Make sure we can construct the PassManager twice and not share any + state between them""" + + passes = [lambda x: 2 * x for _ in range(3)] + constraint = these_before_those_pass_constraint(passes[0], passes[1]) + pm1 = PassManager() + for p in passes: + pm1.add_pass(p) + pm1.add_constraint(constraint) + output1 = pm1(1) + self.assertEqual(output1, 2 ** 3) + + passes = [lambda x: 3 * x for _ in range(3)] + constraint = these_before_those_pass_constraint(passes[0], passes[1]) + pm2 = PassManager() + for p in passes: + pm2.add_pass(p) + pm2.add_constraint(constraint) + output2 
= pm2(1) + self.assertEqual(output2, 3 ** 3) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7970ba4c283e851430ed0025e1ed5c772eb7b1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__init__.py @@ -0,0 +1 @@ +from .common import lift_subgraph_as_module, HolderModule, compare_graphs diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d9a85eb023713d04f82f8b86758454941608283 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..642c0de09f7ecee8923217939ee566ddb14ffc0d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6ffb9f6f04dc0a813730bd986818e2446f5b40a0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/fuser_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/fuser_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8fb328e8596d50bd3d996ff2b2948b1e83c7f05f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/fuser_utils.py @@ -0,0 +1,233 @@ +import copy +from queue import SimpleQueue +from typing import List, Dict, Tuple + +import torch.fx +from torch.fx.graph_module import GraphModule +from torch.fx.graph import Graph +from torch.fx.node import Node +from torch.fx.passes.tools_common import NodeList, NodeSet, legalize_graph +from torch.fx.passes.utils import lift_subgraph_as_module +from torch.fx._compatibility import compatibility + +@compatibility(is_backward_compatible=False) +def topo_sort(nodes: NodeList) -> NodeList: + # sort nodes according to the topological order + indegree_map = dict.fromkeys(nodes, 0) + candidates: SimpleQueue = SimpleQueue() + + for node in nodes: + for n in node.all_input_nodes: + if n in indegree_map: + indegree_map[node] += 1 + if indegree_map[node] == 0: + candidates.put(node) + + sorted_nodes: NodeList = list() + while not candidates.empty(): + node = candidates.get() + sorted_nodes.append(node) + + for n in node.users: + if n in indegree_map: + indegree_map[n] -= 1 + if indegree_map[n] == 0: + candidates.put(n) + + assert len(nodes) == len(sorted_nodes), "topological sorted nodes doesn't have same length as input nodes" + + return sorted_nodes + + +@compatibility(is_backward_compatible=False) +def validate_partition(partition: NodeList) -> bool: + # 
verify the partition does't form a dependency cycle in the original graph + # returns True for valid partition, False for invalid + + partition_set = set(partition) + + outputs: NodeList = list() + for node in partition_set: + for user_node in node.users: + if user_node not in partition_set: + # external user node, need to expose as an output + outputs.append(user_node) + + # Perform BFS on the partition outputs. + # If it reaches a node within the partition, then it found a cycle. + # This function takes the ownership of `root_nodes` and may modify it. + def bfs_find_cycle(root_nodes: NodeList) -> bool: + # Set used to exclude nodes that have already been visited. + # If a node has been visited, that node and all its children have + # been checked for cycles. + visited: NodeSet = set() + + # Start with `root_nodes` and traverse through (toward child nodes) + # their connected sub-graph. Nodes in `visited` won't be added + # to `queue` again. + queue: NodeList = root_nodes + while queue: + current = queue.pop() + visited.add(current) + if current in partition_set: + # Started from partition's `output` nodes, and reached + # another node in partition. Cycle! + return True + for user_node in current.users: + if user_node in visited: + continue + queue.append(user_node) + # `root_nodes` don't cause cycle. + return False + + # Use all output nodes as roots to traverse + # the graph to check cycles. + if bfs_find_cycle(outputs): + return False + + return True + + +@compatibility(is_backward_compatible=False) +def fuse_as_graphmodule(gm: GraphModule, + nodes: NodeList, + module_name: str) -> Tuple[GraphModule, Tuple[Node, ...], Tuple[Node, ...]]: + + """ + Fuse nodes in graph_module into a GraphModule. 
+ + Args: + gm (GraphModule): target graph_module + + nodes (List[Node]): list of nodes in `gm` to fuse, where the node must be topologically sorted + + module_name: class name for the fused GraphModule + + Returns: + fused_gm (GraphModule): fused graph module, where its node is a copy of `nodes` in `gm` + + original_inputs (Tuple[Node, ...]): input nodes to `nodes` in original `gm` + + original_outputs (Tuple[Node, ...]): consumer nodes of `nodes` in original `gm` + + """ + + # assumption: nodes are already sorted in topo order + + for node in nodes: + assert node.graph.owning_module is gm, f"{node} doesn't belong to passed in graph module {gm._get_name()}" + assert not node._erased, f"{node} has been removed from owning graph" + assert node in gm.graph.nodes, f"{node} is not found in graph module {gm._get_name()}" + + # validates partition doesn't introduce dependency circles in the graph + assert validate_partition(nodes), "Invalid partition, found dependency cycles" + + subgraph = Graph() + + node_to_placeholder: Dict[Node, Node] = {} # mapping of nodes from old graph to placeholder in new graph + node_map: Dict[Node, Node] = {} # mapping of nodes from old graph to new graph + + # handles inputs through graph.node_copy's arg_transform functions + def remap_inputs(x): + if x.op == "get_attr": + # TODO: do we really need copy the get_attr node into the graph? 
+ # do something here + pass + + if x in nodes: + # x is inside subgraph, return the copied node + # the node should have been copied aleady, as we are copying graph in the topological order + return node_map[x] + + if x not in node_to_placeholder: + # x is not in subgraph, create a new placeholder for subgraph + placeholder_node = subgraph.placeholder(x.name, type_expr=x.type) + # copy all meta fields, even if some fields might be irrelvant for the placeholder node + placeholder_node.meta = copy.copy(x.meta) + node_to_placeholder[x] = placeholder_node + + return node_to_placeholder[x] + + # copy nodes in topological order + for node in nodes: + new_node = subgraph.node_copy(node, remap_inputs) + node_map[node] = new_node + + # handles outputs + output_mapping: Dict[Node, Node] = {} # mapping from old output to new outputs + + for node in nodes: + for user_node in node.users: + if user_node not in nodes: + # external user node, need to expose as an output + output_mapping[node] = node_map[node] + + # outs contain nodes in the new subgraph + outs = tuple(output_mapping.values()) + + # Take care of the args of FX output node. If there's a single + # output then the output node args is like (output_single), else + # if there're multiple outputs then the output node args is like + # ((output_0, output_1, ...)). + subgraph.output(outs[0] if len(outs) == 1 else outs) + + # lint to ensure correctness + subgraph.lint() + fused_gm: GraphModule + fused_gm, _ = lift_subgraph_as_module(gm, subgraph, comp_name="", class_name=module_name) + + # sub_gm's input nodes in the original module + original_inputs: Tuple[Node, ...] = tuple(node_to_placeholder.keys()) + + # sub_gm's outputs node in the original module + original_outputs: Tuple[Node, ...] 
= tuple(output_mapping.keys()) + + return fused_gm, original_inputs, original_outputs + + +@compatibility(is_backward_compatible=False) +def insert_subgm(gm: GraphModule, sub_gm: GraphModule, orig_inputs: Tuple[Node, ...], orig_outputs: Tuple[Node, ...]): + # add sub_gm into gm + submodule_name = sub_gm.__class__.__name__ + gm.add_submodule(submodule_name, sub_gm) + + # Create a call_module node in main graph. + module_node = gm.graph.call_module( + submodule_name, + args=orig_inputs, + kwargs=None) + + if len(orig_outputs) == 1: + # main_remapping[comp.orig_outputs[0]] = module_node + orig_outputs[0].replace_all_uses_with(module_node, propagate_meta=True) + else: + for i, orig_output in enumerate(orig_outputs): + # Use Proxy to record getitem access. + proxy_out = torch.fx.Proxy(module_node)[i].node # type: ignore[index] + orig_output.replace_all_uses_with(proxy_out, propagate_meta=True) + return gm + +@compatibility(is_backward_compatible=False) +def erase_nodes(gm: GraphModule, nodes: NodeList): + + # erase original nodes in inversed topological order + for node in reversed(nodes): + gm.graph.erase_node(node) + + +@compatibility(is_backward_compatible=False) +def fuse_by_partitions(gm: GraphModule, partitions: List[NodeList]) -> GraphModule: + for partition_id, nodes in enumerate(partitions): + sorted_nodes = topo_sort(nodes) + + submodule_name = "fused_" + str(partition_id) + sub_gm, orig_inputs, orig_outputs = fuse_as_graphmodule(gm, sorted_nodes, submodule_name) + + insert_subgm(gm, sub_gm, orig_inputs, orig_outputs) + + erase_nodes(gm, sorted_nodes) + + # topological sort original gm with newly created sub_gm + legalize_graph(gm) + + return gm diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..c27fe5599639e9f5235b57dcdc94c71d93019e87 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py @@ -0,0 +1,113 @@ +from typing import Dict, List, Tuple + +from torch.fx import Graph, GraphModule, Node + +from torch.fx._compatibility import compatibility +from .matcher_utils import InternalMatch, SubgraphMatcher + +__all__ = ["SubgraphMatcherWithNameNodeMap"] + + +def _split_to_graph_and_name_node_map( + gm: GraphModule, +) -> Tuple[GraphModule, Dict[str, Node]]: + from torch.fx.graph import _PyTreeInfo + from torch.utils._pytree import tree_flatten, tree_unflatten + + name_node_map = {} + for n in gm.graph.nodes: + if n.op == "output": + assert gm._out_spec is not None + output = tree_unflatten(n.args[0], gm._out_spec) + assert isinstance( + output, tuple + ), "Expecting the pattern graph to return a tuple" + assert ( + len(output) >= 2 + ), "Expecting the pattern graph to have at least two outputs" + *out, name_node_map = output + flattened, out_spec = tree_flatten(out) + assert isinstance( + name_node_map, Dict + ), "Expecting the input graph to have a dict output as the last element" + n.args = (flattened,) + orig_pytree_info = gm._graph._codegen.pytree_info + gm._graph._codegen.pytree_info = _PyTreeInfo( + orig_pytree_info.orig_args, orig_pytree_info.in_spec, out_spec + ) + gm.recompile() + return gm, name_node_map + + +@compatibility(is_backward_compatible=False) +class SubgraphMatcherWithNameNodeMap(SubgraphMatcher): + """Extends SubgraphMatcher to support querying the matched subgraph nodes through node name, + this requires pattern to have specific format (returning and additional dictionary at the output, + that has node name as key, and the node in the pattern graph as value, see Example for more details) + + Difference with SubgraphMatcher is that it takes a `pattern_gm` GraphModule as input during + initialization since we need 
to modify the graph (which requires `recompile` the GraphModule) + + Example:: + def pattern(x, weight): + conv = F.conv2d(x, weight) + relu = F.relu(conv) + return relu, {"conv": conv, "relu": relu} + + def target_graph(x, weight): + conv = F.conv2d(x, weight) + relu = F.relu(conv) + relu *= 2 + return relu + + pattern_gm = capture_pre_autograd_graph(pattern, example_inputs) + target_gm = capture_pre_autograd_graph(target_graph, example_inputs) + matcher = SubgraphMatcherWithNameNodeMap(pattern_gm) + matches = matcher.match(target_gm) + for match in matches: + match.name_node_map["conv"].meta["annotation"] = ... + + """ + + def __init__( + self, + pattern_gm: GraphModule, + match_output: bool = False, + match_placeholder: bool = False, + remove_overlapping_matches: bool = True, + ignore_literals: bool = False, + ) -> None: + pattern_gm, name_node_map = _split_to_graph_and_name_node_map(pattern_gm) + self.name_node_map = name_node_map + super().__init__( + pattern_gm.graph, + match_output, + match_placeholder, + remove_overlapping_matches, + ignore_literals, + ) + + def match(self, graph: Graph) -> List[InternalMatch]: + """The returned InternalMatch will have name_node_map populated with a map + from node name (str) to the target node, e.g. + {"conv": target_conv_ndoe, "relu": target_relu_node} + + this requires the pattern graph returns an additional + output of node name to node, e.g. instead of: + ``` + def pattern(...): + ... + return relu + ``` + we should do: + ``` + def pattern(...): + ... 
+ return relu, {"conv": conv, "relu": relu} + ``` instead + """ + internal_matches = super().match(graph) + for internal_match in internal_matches: + for k, n in self.name_node_map.items(): + internal_match.name_node_map[k] = internal_match.nodes_map[n] + return internal_matches diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUApplyUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUApplyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5c524ef97c475a0529b7b18c430be0d39c350aa4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUApplyUtils.h @@ -0,0 +1,343 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace at { + +/* + * The basic strategy for apply is as follows: + * + * 1. Starting with the outermost index, loop until we reach a dimension where + * the data is no longer contiguous, i.e. the stride at that dimension is not + * equal to the size of the tensor defined by the outer dimensions. Let's call + * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then + * A is equal to the entire Tensor. Let's call the inner tensor B. + * + * 2. We loop through the indices in B, starting at its outermost dimension. For + * example, if B is a 2x2 matrix, then we do: + * + * B[0][0] + * B[0][1] + * B[1][0] + * B[1][1] + * + * We set the offset into the underlying storage as (storageOffset + stride_B * + * index_B), i.e. basically we compute the offset into the storage as we would + * normally for a Tensor. But because we are guaranteed the subsequent data is + * contiguous in memory, we can simply loop for sizeof(A) iterations and perform + * the operation, without having to follow the order described by the strides of + * A. + * + * 3. As an optimization, we merge dimensions of A that are contiguous in + * memory. 
For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, + * then the first two dimensions can be merged for the purposes of APPLY, + * reducing the number of nested loops. + */ + +inline Tensor sort_strides(Tensor& tensor_) { + IntArrayRef strides = tensor_.strides(); + std::vector indices; + indices.reserve(tensor_.ndimension()); + for (const auto i : c10::irange(tensor_.ndimension())) { + indices.push_back(i); + } + std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) { + return strides[i1] > strides[i2]; + }); + Tensor tensor = tensor_.permute(indices); + return tensor; +} + +template +struct strided_tensor_iter_fixed { + public: + T* data_ = NULL; + int64_t dim_ = 0; + + int64_t counter_[N] = {0}; + int64_t sizes_[N] = {0}; + int64_t strides_[N] = {0}; + + strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete; + void operator=(strided_tensor_iter_fixed const& x) = delete; + strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; + strided_tensor_iter_fixed( + Tensor& tensor, + C10_UNUSED bool sort_strides = false) + : data_(tensor.data_ptr()) { + std::memset(counter_, 0, sizeof(int64_t) * N); + if (tensor.dim() > 0) { + std::memcpy( + sizes_, tensor.sizes().data(), tensor.dim() * sizeof(int64_t)); + std::memcpy( + strides_, tensor.strides().data(), tensor.dim() * sizeof(int64_t)); + } + dim_ = std::get<1>(collapse_dims(sizes_, strides_, tensor.ndimension())); + } +}; + +template +struct strided_tensor_iter { + private: + public: + T* data_ = NULL; + int64_t dim_; + + std::vector counter_; + std::vector sizes_; + std::vector strides_; + + strided_tensor_iter(strided_tensor_iter const&) = delete; + void operator=(strided_tensor_iter const& x) = delete; + strided_tensor_iter(strided_tensor_iter&&) = default; + strided_tensor_iter(Tensor& tensor) + : data_(tensor.data_ptr()), + dim_(tensor.ndimension()), + counter_(dim_, 0), + sizes_(tensor.sizes().vec()), + strides_(tensor.strides().vec()) { + dim_ = 
std::get<1>(collapse_dims(sizes_.data(), strides_.data(), dim_)); + } +}; + +inline bool _all_equal_numel(at::ArrayRef tensors) { + if (tensors.empty()) + return true; + int64_t all_numel = tensors[0].numel(); + for (const auto i : c10::irange(1, tensors.size())) { + if (tensors[i].numel() != all_numel) + return false; + } + return true; +} + +inline std::string _all_equal_numel_error(at::ArrayRef tensors) { + std::ostringstream oss; + oss << "inconsistent tensor size, expected "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].sizes() << ", "; + } + oss << "and " << tensors[tensors.size() - 1].sizes() + << " to have the same number of elements, but got "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].numel() << ", "; + } + oss << "and " << tensors[tensors.size() - 1].numel() + << " elements respectively"; + return oss.str(); +} + +inline bool _apply_preamble(ArrayRef tensors) { + checkDeviceType("CPU_tensor_apply", tensors, kCPU); + checkLayout("CPU_tensor_apply", tensors, kStrided); + if (!_all_equal_numel(tensors)) + AT_ERROR(_all_equal_numel_error(tensors)); + // An empty tensor has no elements + for (auto& t : tensors) + if (t.numel() == 0) + return false; + return true; +} + +inline int64_t _max_dim_tensors(ArrayRef tensors) { + int64_t dim = 0; + for (auto& t : tensors) + dim = std::max(dim, t.ndimension()); + return dim; +} + +inline void iterate(int64_t /*size*/){}; + +template +inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) { + iter.counter_[iter.dim_ - 1] += size; + iter.data_ = iter.data_ + size * iter.strides_[iter.dim_ - 1]; + iterate(size, iter_tail...); +} + +inline bool iterate_continue() { + return true; +}; + +template +inline bool iterate_continue(Arg& iter, Args&... 
iter_tail) { + return iter.counter_[iter.dim_ - 1] < iter.sizes_[iter.dim_ - 1] && + iterate_continue(iter_tail...); +} + +inline int64_t max_iterate_size() { + return std::numeric_limits::max(); +}; + +template +inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) { + return std::min( + (iter.sizes_[iter.dim_ - 1] - iter.counter_[iter.dim_ - 1]), + max_iterate_size(iter_tail...)); +} + +inline void iterate_overflow(){}; + +template +inline void iterate_overflow(Arg& iter, Args&... iter_tail) { + if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) { + for (int64_t i = iter.dim_ - 1; i > 0; i--) { + if (iter.counter_[i] == iter.sizes_[i]) { + iter.counter_[i] = 0; + iter.counter_[i - 1]++; + iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) + + iter.strides_[i - 1]; + } + } + } + iterate_overflow(iter_tail...); +} + +inline void forward(int64_t /*offset*/){}; + +template +inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) { + int64_t multi = offset; + for (int64_t i = iter.dim_ - 1; i >= 0; i--) { + int64_t inc = multi % iter.sizes_[i]; + multi = multi / iter.sizes_[i]; + iter.data_ = iter.data_ + inc * iter.strides_[i]; + iter.counter_[i] += inc; + } + forward(offset, iter_tail...); +} + +inline int64_t max_dim() { + return 0; +} + +template +inline int64_t max_dim(Arg& iter, Args&... iter_tail) { + return std::max(iter.dim_, max_dim(iter_tail...)); +} + +inline void apply_op(){}; + +template +inline void apply_op( + int64_t numel, + int64_t offset, + const Op& op, + Args... iters) { + // For 0-dim tensors + if (numel == 1 && max_dim(iters...) == 0) { + op(*iters.data_...); + return; + } + if (offset > 0) + forward(offset, iters...); + // Splitting this into chunks helps the compiler create faster assembly + for (int64_t i = 0; i < numel;) { + for (; iterate_continue(iters...) 
&& i < numel;) { + op(*iters.data_...); + iterate(1, iters...); + i++; + } + iterate_overflow(iters...); + } +} + +/* + Apply a pointwise operator to sequence of tensors + + The calling convention for op is a function/functor that takes the same + number of pointers of type scalar as the number of given tensors. For example, + to compute a = b * c, op would be of the form: + [](scalar* a_val, const scalar* b_val, const scalar* c_val) { a_val[0] = + b_val[0] * c_val[0]; }; +*/ + +template +inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (_max_dim_tensors({tensor1, tensor2}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + } +} + +template +inline void CPU_tensor_apply3( + Tensor tensor1, + Tensor tensor2, + Tensor tensor3, + const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3)); + } +} + +template < + typename scalar1, + typename scalar2, + typename scalar3, + typename scalar4, + typename Op> +inline void CPU_tensor_apply4( + Tensor tensor1, + Tensor tensor2, + Tensor tensor3, + Tensor tensor4, + const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3, tensor4})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3), 
+ strided_tensor_iter_fixed(tensor4)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3), + strided_tensor_iter(tensor4)); + } +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..17c4ddd92f1d469abb771ed0392eed0df0508b1a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFunctions.h @@ -0,0 +1,29 @@ +#include + +// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch] +// Code introduced to avoid cyclic dependency in static dispatch is no longer +// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place, +// to Operators.cpp for supporting multiple backends with multiple kernels. +// +// Note [Avoiding Include Cycles In Static Dispatch] +// In order to avoid #include cycles in the static dispatch build, we've carefully split out +// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h. +// +// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h. +// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods +// all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all +// directly inlined into TensorBody.h. +// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API, +// which include functions that have defaultable optional arguments. +// That requires knowing the full Tensor class definition. 
+// +// We break the cycle by doing the following: +// - Split out CPUFunction.h into two files: CPUFunctions.h and CPUFunctions_inl.h +// - CPUFunction.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl., +// - CPUFunctions_inl.h includes everything else +// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class, +// and then it includes CPUFunctions_inl.h. +// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly. +// - This also means that static dispatch build, CPUFunctions.h only needs to +// #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h. +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h new file mode 100644 index 0000000000000000000000000000000000000000..e7d79fc715d6209920c6f3d4a2d02c7d8077b6bd --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h @@ -0,0 +1,542 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + . \ + See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Context.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Context.h new file mode 100644 index 0000000000000000000000000000000000000000..931cd86e77d984cc3b69aca0516b7c3489320825 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Context.h @@ -0,0 +1,560 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +class Tensor; + +enum class TORCH_API 
Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; + +class TORCH_API Context { + public: + Context(); + + const Generator& defaultGenerator(Device device) { + c10::DeviceType device_type = device.type(); + initCUDAIfNeeded(device_type); + initHIPIfNeeded(device_type); + if (device_type == at::kCPU) { + return at::detail::getDefaultCPUGenerator(); + } else if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index()); + } else if (device_type == at::kMPS) { + return at::detail::getMPSHooks().getDefaultMPSGenerator(); + } else if (device_type == at::kXPU) { + return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index()); + } else if (device_type == at::kIPU) { + return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index()); + } else if (device_type == at::kPrivateUse1) { + return at::GetPrivateUse1HooksInterface()->getDefaultGenerator( + device.index()); + } else { + AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled."); + } + } + const AcceleratorHooksInterface& getAcceleratorHooksInterface( + c10::optional opt_device_type = c10::nullopt) { + c10::DeviceType device_type = opt_device_type.has_value() + ? 
opt_device_type.value() + : at::getAccelerator(true).value(); + if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks(); + } else if (device_type == at::kMPS) { + return at::detail::getMPSHooks(); + } else if (device_type == at::kPrivateUse1) { + return at::detail::getPrivateUse1Hooks(); + } else { + AT_ERROR( + c10::DeviceTypeName(device_type), " device type not an accelerator."); + } + } + Device getDeviceFromPtr(void* data, c10::DeviceType device_type) { + initCUDAIfNeeded(device_type); + initHIPIfNeeded(device_type); + initXPUIfNeeded(device_type); + if (device_type == at::kCPU) { + return c10::DeviceType::CPU; + } else if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks().getDeviceFromPtr(data); + } else if (device_type == at::kXPU) { + return at::detail::getXPUHooks().getDeviceFromPtr(data); + } else if (device_type == at::kPrivateUse1) { + return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data); + } else { + AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled."); + } + } + static bool isPinnedPtr(const void* data) { + return detail::getCUDAHooks().isPinnedPtr(data); + } + static bool hasOpenMP(); + static bool hasMKL(); + static bool hasLAPACK(); + static bool hasMKLDNN(); + static bool hasMAGMA() { + return detail::getCUDAHooks().hasMAGMA(); + } + static bool hasCUDA() { + return detail::getCUDAHooks().hasCUDA(); + } + static bool hasMTIA() { + return detail::getMTIAHooks().hasMTIA(); + } + static bool hasCUDART() { + return detail::getCUDAHooks().hasCUDART(); + } + static long versionCUDART() { + return detail::getCUDAHooks().versionCUDART(); + } + static bool hasCuDNN() { + return detail::getCUDAHooks().hasCuDNN(); + } + static long versionCuDNN() { + return detail::getCUDAHooks().versionCuDNN(); + } + static bool hasCuSOLVER() { + return detail::getCUDAHooks().hasCuSOLVER(); + } + static bool hasHIP() { + return detail::getHIPHooks().hasHIP(); + } + static bool hasMPS() { + return 
detail::getMPSHooks().hasMPS(); + } + static bool hasIPU() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU); + } + static bool hasXLA() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA); + } + static bool hasXPU() { + return detail::getXPUHooks().hasXPU(); + } + static bool hasLazy() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy); + } + static bool hasORT() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT); + } + // defined in header so that getNonVariableType has ability to inline + // call_once check. getNonVariableType is called fairly frequently + void lazyInitCUDA() { + c10::call_once(thc_init, [&] { detail::getCUDAHooks().initCUDA(); }); + } + void lazyInitHIP() { + c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); }); + } + void lazyInitXPU() { + c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); }); + } + void lazyInitPrivateUse1() { + c10::call_once(thp_init, [&] { + if (isPrivateUse1HooksRegistered()) { + at::GetPrivateUse1HooksInterface()->initPrivateUse1(); + } + }); + } + static const at::cuda::NVRTC& getNVRTC() { + return detail::getCUDAHooks().nvrtc(); + } + + static bool setFlushDenormal(bool on); + + // NB: This method is *purely* whether or not a user requested + // that CuDNN was enabled, it doesn't actually say anything about + // whether or not CuDNN is actually usable. 
Use cudnn_is_acceptable + // to test this instead + bool userEnabledCuDNN() const; + void setUserEnabledCuDNN(bool e); + bool userEnabledMkldnn() const; + void setUserEnabledMkldnn(bool e); + bool benchmarkCuDNN() const; + void setBenchmarkCuDNN(bool); + int benchmarkLimitCuDNN() const; + void setBenchmarkLimitCuDNN(int); + bool deterministicCuDNN() const; + void setDeterministicCuDNN(bool); + bool userEnabledNNPACK() const; + void setUserEnabledNNPACK(bool e); + + // Note [Disabling Fused SDP Kernels] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Flash and Memory Efficient SDP kernels are enabled by default. + // However, they can be disabled by setting + // at::globalContext().setUserEnabledFlashSDP(false) flag. + // This is useful for debugging purposes. For example, if you want to + // compare the performance of the flash SDP kernels with the unfused + // kernel, you can disable the flash SDP kernels. By disabling + // the math SDP kernel, you can force your code to use flash kernels. + // The math SDP kernel can be disabled by setting + // at::globalContext().setUserEnabledMathSDP(false) flag. 
+ void setSDPUseFlash(bool); + bool userEnabledFlashSDP() const; + + void setSDPUseMemEfficient(bool); + bool userEnabledMemEfficientSDP() const; + + void setSDPUseMath(bool); + bool userEnabledMathSDP() const; + + void setSDPUseCuDNN(bool); + bool userEnabledCuDNNSDP() const; + + at::LinalgBackend linalgPreferredBackend() const; + void setLinalgPreferredBackend(at::LinalgBackend); + + // Note [Enabling Deterministic Operations] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Operations in PyTorch that normally act nondeterministically, but have an + // alternate deterministic implementation, should satisfy the following + // requirements: + // + // * Include this comment: "See Note [Enabling Deterministic Operations]" + // + // * Check the value of `at::globalContext().deterministicAlgorithms()` to + // toggle + // between nondeterministic and deterministic implementations. + // + // * Have an entry in the list of PyTorch operations that toggle between + // nondeterministic + // and deterministic implementations, in the docstring of + // `use_deterministic_algorithms()` in torch/__init__.py + // + // `example_func()` below shows an example of toggling between + // nondeterministic and deterministic implementations: + // + // void example_func() { + // // See Note [Enabling Deterministic Operations] + // if (at::globalContext().deterministicAlgorithms()) { + // example_func_deterministic(); + // } else { + // example_func_nondeterministic(); + // } + // } + + bool deterministicAlgorithms() const; + bool deterministicAlgorithmsWarnOnly() const; + void setDeterministicAlgorithms(bool, bool); + bool deterministicFillUninitializedMemory() const; + void setDeterministicFillUninitializedMemory(bool); + + // Note [Writing Nondeterministic Operations] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Operations in PyTorch that act nondeterministically and do not have an + // alternate deterministic implementation should satisfy the following + // requirements: + // + 
// * Include this comment: "See Note [Writing Nondeterministic Operations]" + // + // * Include a comment explaining why the operation is nondeterministic. + // + // * Throw an error when `Context::deterministicAlgorithms()` is true. Most + // of the time, this should be accomplished by calling + // `at::globalContext().alertNotDeterminstic()`. However, if the + // nondeterministic behavior is caused by the CuBLAS workspace + // configuration in CUDA >= 10.2, + // `at::globalContext().alertCuBLASConfigNotDeterministic()` should be + // called instead (in this case, a comment explaining why the operation is + // nondeterministic is not necessary). See below for details on these + // methods. + // + // * Have an entry in the list of nondeterministic PyTorch operations in the + // docstring of `use_deterministic_algorithms()` in torch/__init__.py + // + // * Have a test function in `test/test_torch.py` whose name begins with + // `test_nondeterministic_alert_`. Alternatively, if CuBLAS workspace + // configuration is the reason for nondeterminism, the operation should be + // included in the `test_cublas_config_nondeterministic_alert` test. Any new + // tests should ideally follow a pattern similar to the existing ones. + // + // `example_func()` below shows an example of the comments and error-throwing + // code for a nondeterministic operation: + // + // void example_func() { + // // See Note [Writing Nondeterministic Operations] + // // Nondeterministic because + // at::globalContext().alertNondeterministic("example_func"); + // ... + // } + + // Throws an error if `Context::deterministicAlgorithms()` is true + static void alertNotDeterministic(c10::string_view const& caller); + + // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA + // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or + // ":4096:8". 
For more details: + // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility + void alertCuBLASConfigNotDeterministic() const; + + void setFloat32MatmulPrecision(const std::string& s); + bool allowTF32CuDNN() const; + void setAllowTF32CuDNN(bool); + bool allowTF32CuBLAS() const; + void setAllowTF32CuBLAS(bool); + Float32MatmulPrecision float32MatmulPrecision() const; + void setFloat32MatmulPrecision(Float32MatmulPrecision p); + bool allowFP16ReductionCuBLAS() const; + void setAllowFP16ReductionCuBLAS(bool); + bool allowBF16ReductionCuBLAS() const; + void setAllowBF16ReductionCuBLAS(bool); + at::QEngine qEngine() const; + void setQEngine(at::QEngine e); + static const std::vector& supportedQEngines(); + static bool isXNNPACKAvailable(); + void setCheckSparseTensorInvariants(bool e); + bool checkSparseTensorInvariants() const; + // This method is used to release the original weight after pre-packing. + // It should be called once before loading/running the model. + // NB: By default it is set to true for mobile builds. 
+ void setReleaseWeightsWhenPrepacking(bool e); + bool releaseWeightsWhenPrepacking() const; + + void setDisplayVmapFallbackWarnings(bool enabled); + bool areVmapFallbackWarningsEnabled() const; + + void setDefaultMobileCPUAllocator(); + void unsetDefaultMobileCPUAllocator(); + bool allowFP16ReductionCPU() const; + void setAllowFP16ReductionCPU(bool); + + private: + void initCUDAIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::CUDA) { + lazyInitCUDA(); + } + } + void initHIPIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::HIP) { + lazyInitHIP(); + } + } + void initXPUIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::XPU) { + lazyInitXPU(); + } + } + static bool checkCuBLASConfigDeterministic(); + c10::once_flag thc_init; + c10::once_flag thh_init; + c10::once_flag thx_init; + c10::once_flag thp_init; + bool enabled_cudnn = true; + bool deterministic_cudnn = false; + bool _deterministic_algorithms = false; + bool _deterministic_algorithms_warn_only = false; + bool _deterministic_fill_uninitialized_memory = true; + bool enabled_flashSDP = true; + bool enabled_mem_efficientSDP = true; + bool enabled_mathSDP = true; + bool enabled_cudnnSDP = false; +#ifdef USE_ROCM + bool benchmark_cudnn = true; +#else + bool benchmark_cudnn = false; +#endif + Float32MatmulPrecision float32_matmul_precision = + c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true + ? at::Float32MatmulPrecision::HIGH + : at::Float32MatmulPrecision::HIGHEST; + int benchmark_limit_cudnn = 10; + bool allow_tf32_cudnn = true; + bool allow_fp16_reduction_cublas = true; + bool allow_bf16_reduction_cublas = true; + bool enabled_mkldnn = true; + bool enabled_nnpack = true; + at::LinalgBackend linalg_preferred_backend = + c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true + ? 
at::LinalgBackend::Cusolver + : at::LinalgBackend::Default; +#ifdef C10_MOBILE + bool release_original_weights = true; +#else + bool release_original_weights = false; +#endif + bool display_vmap_fallback_warnings_ = false; + c10::optional quantized_engine = c10::nullopt; + bool enable_sparse_tensor_invariant_checks = false; + bool allow_fp16_reduction_cpu = false; + + Allocator* prev_allocator_ptr_{nullptr}; +}; + +TORCH_API Context& globalContext(); + +static inline void init() { + globalContext(); +} + +TORCH_API Allocator* getCPUAllocator(); + +static inline DeprecatedTypeProperties& getDeprecatedTypeProperties( + Backend p, + ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + p, s); +} + +static inline DeprecatedTypeProperties& CPU(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::CPU, s); +} + +static inline DeprecatedTypeProperties& CUDA(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::CUDA, s); +} + +static inline DeprecatedTypeProperties& HIP(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::HIP, s); +} + +static inline DeprecatedTypeProperties& MPS(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::MPS, s); +} + +static inline bool hasCUDA() { + return globalContext().hasCUDA(); +} + +static inline bool hasMTIA() { + return globalContext().hasMTIA(); +} + +static inline bool hasHIP() { + return globalContext().hasHIP(); +} + +static inline bool hasIPU() { + return globalContext().hasIPU(); +} + +static inline bool hasXLA() { + return globalContext().hasXLA(); +} + +static inline bool hasMPS() { + return globalContext().hasMPS(); +} + +static inline bool hasORT() { + return globalContext().hasORT(); +} + +static inline bool hasXPU() { + return globalContext().hasXPU(); +} + +// Despite its 
name, this function returns the number of *CUDA* GPUs. +static inline size_t getNumGPUs() { + // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS + // FUNCTION. If you are interested in interrogating the number of + // devices for a specific device type, add that function to the + // relevant library (e.g., similar to at::cuda::device_count()) + if (hasCUDA() && hasHIP()) { + throw std::runtime_error( + "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades " + "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually " + "means HIP. Rebuild PyTorch with one or the other disabled."); + } else if (hasCUDA()) { + return detail::getCUDAHooks().getNumGPUs(); + } else if (hasHIP()) { + return detail::getHIPHooks().getNumGPUs(); + } else { + return 0; + } +} + +static inline bool hasOpenMP() { + return globalContext().hasOpenMP(); +} + +static inline bool hasMKL() { + return globalContext().hasMKL(); +} + +static inline bool hasLAPACK() { + return globalContext().hasLAPACK(); +} + +static inline bool hasMAGMA() { + return globalContext().hasMAGMA(); +} + +static inline bool hasMKLDNN() { + return globalContext().hasMKLDNN(); +} + +static inline void manual_seed(uint64_t seed) { + auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + gen.set_current_seed(seed); + } + // NB: Sometimes we build with CUDA, but we don't have any GPUs + // available. In that case, we must not seed CUDA; it will fail! 
+ const auto cuda_num_gpus = detail::getCUDAHooks().getNumGPUs(); + if (hasCUDA() && cuda_num_gpus > 0) { + for (const auto i : c10::irange(cuda_num_gpus)) { + auto cuda_gen = globalContext().defaultGenerator( + Device(at::kCUDA, static_cast(i))); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(cuda_gen.mutex()); + cuda_gen.set_current_seed(seed); + } + } + } + + const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs(); + if (hasXPU() && xpu_num_gpus) { + for (const auto i : c10::irange(xpu_num_gpus)) { + auto xpu_gen = globalContext().defaultGenerator( + Device(at::kXPU, static_cast(i))); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(xpu_gen.mutex()); + xpu_gen.set_current_seed(seed); + } + } + } + + if (hasMPS()) { + auto mps_gen = globalContext().defaultGenerator(c10::DeviceType::MPS); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(mps_gen.mutex()); + mps_gen.set_current_seed(seed); + } +} + +// When the global flag `allow_tf32` is set to true, cuBLAS handles are +// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH. +// For some operators, such as addmv, TF32 offers no performance improvement +// but causes precision loss. To help this case, this class implements +// a RAII guard that can be used to quickly disable TF32 within its scope. 
+// +// Usage: +// NoTF32Guard disable_tf32; +struct TORCH_API NoTF32Guard { + NoTF32Guard(); + ~NoTF32Guard(); + static bool should_disable_tf32(); + + private: + bool changed = false; +}; + +struct TORCH_API ROCmBackwardPassGuard { + ROCmBackwardPassGuard(); + ~ROCmBackwardPassGuard(); + static bool is_backward_pass(); +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DeviceAccelerator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DeviceAccelerator.h new file mode 100644 index 0000000000000000000000000000000000000000..c3e800c7e07c65c4289baa46ba29d9b61cc5dd20 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DeviceAccelerator.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include +#include + +// This file defines the top level Accelerator concept for PyTorch. +// A device is an accelerator per the definition here if: +// - It is mutually exclusive with all other accelerators +// - It performs asynchronous compute via a Stream/Event system +// - It provides a set of common APIs as defined by AcceleratorHooksInterface +// +// As of today, accelerator devices are (in no particular order): +// CUDA, MTIA, PrivateUse1 +// We want to add once all the proper APIs are supported and tested: +// HIP, MPS, XPU + +namespace at { + +// Ensures that only one accelerator is available (at +// compile time if possible) and return it. +// When checked is true, the returned optional always has a value. 
+TORCH_API std::optional getAccelerator(bool checked = false); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/InferSize.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/InferSize.h new file mode 100644 index 0000000000000000000000000000000000000000..111c7eb8f5fc7cd20a3eb812450324788608c011 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/InferSize.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +// Infers the size of a dim with size -1, if it exists. Also checks that new +// shape is compatible with the number of elements. +// +// templated to handle std::vector and DimVector use cases, see +// below +// +template +inline void infer_size_impl( + InputArrayRef shape, + NumelType numel, + ResultVec& res) { + NumelType newsize = 1; + // N.B. this is an index, not a sym dim! + auto infer_dim = c10::optional(); + for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { + if (shape[dim] == -1) { + if (infer_dim) { + throw std::runtime_error("only one dimension can be inferred"); + } + infer_dim = dim; + } else if (shape[dim] >= 0) { + newsize *= shape[dim]; + } else { + AT_ERROR("invalid shape dimension ", shape[dim]); + } + } + + if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) { + if (infer_dim) { + // We have a degree of freedom here to select the dimension size; follow + // NumPy semantics and just bail. However, a nice error message is needed + // because users often use `view` as a way to flatten & unflatten + // dimensions and will otherwise be confused why + // empty_tensor.view( 0, 0) + // works yet + // empty_tensor.view(-1, 0) + // doesn't. 
+ TORCH_CHECK( + newsize != 0, + "cannot reshape tensor of 0 elements into shape ", + shape, + " because the unspecified dimension size -1 can be any " + "value and is ambiguous"); + res[*infer_dim] = numel / newsize; + } + return; + } + + std::ostringstream ss; + ss << "shape '" << shape << "' is invalid for input of size " << numel; + throw std::runtime_error(ss.str()); +} + +inline std::vector infer_size(IntArrayRef shape, int64_t numel) { + auto res = shape.vec(); + infer_size_impl(shape, numel, res); + return res; +} + +inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) { + auto res = at::DimVector(shape); + infer_size_impl(shape, numel, res); + return res; +} + +inline at::SymDimVector infer_size_dv( + c10::SymIntArrayRef shape, + c10::SymInt numel) { + auto res = at::SymDimVector(shape); + infer_size_impl( + shape, std::move(numel), res); + return res; +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelFuture.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelFuture.h new file mode 100644 index 0000000000000000000000000000000000000000..042cd92da19345d7523671ca75da7279d13062a9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelFuture.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include + +namespace at { + +// Launches intra-op parallel task, returns a future +TORCH_API c10::intrusive_ptr intraop_launch_future( + std::function func); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Scalar.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Scalar.h new file mode 100644 index 0000000000000000000000000000000000000000..e12557428f15674e4382983c07de64c3e43e8af0 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Scalar.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarType.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarType.h new file mode 100644 index 0000000000000000000000000000000000000000..2181250740e23808f06e63660f50ca887169bcb1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarType.h @@ -0,0 +1,4 @@ +#pragma once +#include // for BC reasons +#include +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseCsrTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseCsrTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..515ddc6e7e18d9e11b391ec10ddcb47f6c9838d8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseCsrTensorImpl.h @@ -0,0 +1,186 @@ +#pragma once + +#include +#include +#include +namespace at { + +// Struct implementing a sparse CSR tensor. It uses three 1-D tensors for +// denoting the data: `crow_indices_`, `col_indices_` and `values_`. +// The `crow_indices_` tensor is a integer tensor of shape `(size(0) + 1)` +// that represents the compressed row indices of the CSR tensor. The +// `col_indices_` tensor is an integer tensor of shape `(nnz())` +// that explicitly stores the column indices of each value of the sparse +// tensor. The `values_` tensor can be of any pytorch-supported data type +// and has shape `(nnz())`. +// +// Since the main advantage of the CSR format over the COO format is speed of +// computation, care must be taken to facilitate smooth interfacing of +// these data structures with optimized libraries such as MKL and MAGMA. 
+// Since the MKL interface for pytorch currently uses indexing with int32 +// type, it is important to make sure that the `crow_indices` and `col_indices` +// are of type int32 when calling MKL routines such as SPMM or SPMV. +// +// If not calling MKL, it should be alright to use 64 bit integer tensors +// for indexing. +struct TORCH_API SparseCsrTensorImpl : public TensorImpl { + Tensor crow_indices_; + Tensor col_indices_; + Tensor values_; + Layout layout_; + + public: + explicit SparseCsrTensorImpl( + at::DispatchKeySet, + at::Device device, + Layout layout, + const caffe2::TypeMeta); + + void resize_(int64_t nnz, IntArrayRef size); + void resize_and_clear_( + int64_t sparse_dim, + int64_t dense_dim, + IntArrayRef size); + void resize_as_sparse_compressed_tensor_(const Tensor& src); + void set_member_tensors( + const Tensor& crow_indices, + const Tensor& col_indices, + const Tensor& values, + c10::SymIntArrayRef size); + void set_member_tensors( + const Tensor& crow_indices, + const Tensor& col_indices, + const Tensor& values, + IntArrayRef size); + const Tensor& compressed_indices() const { + return crow_indices_; + } + const Tensor& plain_indices() const { + return col_indices_; + } + const Tensor& values() const { + return values_; + } + int64_t nnz() { + return col_indices_.size(-1); + } + + inline int64_t batch_dim() const noexcept { + return crow_indices_.dim() - 1; + } + + inline int64_t sparse_dim() const noexcept { + return 2; + } + + inline int64_t dense_dim() const noexcept { + return values_.dim() - batch_dim() - block_dim() - 1; + } + + private: + inline int64_t block_dim() const noexcept { + return (layout_ == kSparseBsr || layout_ == kSparseBsc ? 
2 : 0); + } + + protected: + IntArrayRef strides_custom() const override; + SymIntArrayRef sym_strides_custom() const override; + bool is_contiguous_custom(MemoryFormat) const override; + + public: + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + void set_storage_offset(int64_t storage_offset) override; + Layout layout_impl() const override { + return layout_; + } + void set_layout(Layout layout) { + switch (layout) { + case kSparseCsr: + case kSparseCsc: + case kSparseBsr: + case kSparseBsc: + layout_ = layout; + break; + default: + TORCH_CHECK(false, "unsupported layout ", layout); + } + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + key_set(), device(), layout_impl(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. 
+ */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + key_set(), device(), layout_impl(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + private: + explicit SparseCsrTensorImpl( + at::DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + at::Tensor crow_indices, + at::Tensor col_indices, + at::Tensor values, + at::Layout layout); + + const char* tensorimpl_type_name() const override; + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. 
+ */ + static void copy_tensor_metadata( + const SparseCsrTensorImpl* src_sparse_impl, + SparseCsrTensorImpl* dest_sparse_impl, + c10::VariableVersion version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_sparse_impl, + dest_sparse_impl, + std::move(version_counter), + allow_tensor_metadata_change); + + // Sparse-specific fields + dest_sparse_impl->crow_indices_ = src_sparse_impl->compressed_indices(); + dest_sparse_impl->col_indices_ = src_sparse_impl->plain_indices(); + dest_sparse_impl->values_ = src_sparse_impl->values(); + dest_sparse_impl->layout_ = src_sparse_impl->layout_impl(); + } +}; +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Tensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..0b3719cca3bf1ff7154625c510c8292dd47444a7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Tensor.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIterator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIterator.h new file mode 100644 index 0000000000000000000000000000000000000000..4a1a2bce60808a67629bcd53492413017cd1d429 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIterator.h @@ -0,0 +1,1002 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { +class Tensor; +class OptionalTensorRef; +using NameVector = SmallVector; +} // namespace at + +// TensorIterator is a helper class for element-wise operations, such as +// arithmetic, comparisons, and trigonometric functions. 
It handles +// broadcasting and type conversions of operands. +// +// This is inspired by NumPy's Array Iterator API (NpyIter). +// +// The files Loops.h and Loops.cuh provide functions to build kernels that +// use TensorIterator. +// +// Example: +// +// auto iter = TensorIteratorConfig() +// .add_output(output) +// .add_input(input) +// .build() +// +// [MyKernel.cpp / MyKernel.cu] +// cpu_kernel(iter, [](float a, float b) { +// return a + b; +// }); +// +// gpu_kernel(iter, []GPU_LAMBDA(float a, float b) -> float { +// return a + b; +// }); +// +// Note [Order of Construction] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// When setting up the tensor iterator configuration, the output Tensors +// have to be added first via +// TensorIteratorConfig::add_owned_output(at::Tensor). After adding all outputs, +// the inputs can be added via +// TensorIteratorConfig::add_owned_input(at::Tensor). +// Adding another output after inputs have been added will rise an exception. +// +// Note [Common Dtype Computation] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Some operations have a natural notion of a "common dtype" or +// "computation dtype" where all inputs are cast to one dtype, the +// operation is performed, and then the results are cast to all outputs. +// +// TensorIterator infers a common dtype if all inputs have the same dtype, +// and it computes one using type promotion rules on its inputs if +// promote_inputs_to_common_dtype_ is true. Attempting to query +// a common dtype otherwise will throw an exception. +// +// Note that the outputs are not considered when computing a common dtype. + +namespace at { + +namespace internal { +// This parameter is heuristically chosen to determine the minimum number of +// work that warrants parallelism. For example, when summing an array, it is +// deemed inefficient to parallelise over arrays shorter than 32768. Further, +// no parallel algorithm (such as parallel_reduce) should split work into +// smaller than GRAIN_SIZE chunks. 
+constexpr int64_t GRAIN_SIZE = 32768; + +// Storage for a non-owning Tensor, without needing to include Tensor.h +class TORCH_API OpaqueOptionalTensorRef { + alignas(alignof(TensorBase)) std::array data_{}; + + public: + OpaqueOptionalTensorRef(); + OpaqueOptionalTensorRef(const OpaqueOptionalTensorRef&) = default; + OpaqueOptionalTensorRef& operator=(const OpaqueOptionalTensorRef&) = default; + OpaqueOptionalTensorRef(OpaqueOptionalTensorRef&&) noexcept = default; + OpaqueOptionalTensorRef& operator=(OpaqueOptionalTensorRef&&) noexcept = + default; + ~OpaqueOptionalTensorRef(); + + OptionalTensorRef* get() { + return reinterpret_cast(data_.data()); + } + const OptionalTensorRef* get() const { + return reinterpret_cast(data_.data()); + } + + OptionalTensorRef& operator*() { + return *get(); + } + const OptionalTensorRef& operator*() const { + return *get(); + } + OptionalTensorRef* operator->() { + return get(); + } + const OptionalTensorRef* operator->() const { + return get(); + } + + const Tensor& getTensor() const; +}; +} // namespace internal + +struct TORCH_API OperandInfo { + using StrideVector = SmallVector; + OperandInfo() = default; + C10_ALWAYS_INLINE explicit OperandInfo(c10::MaybeOwned&& t) { + if (t->defined()) { + device = t->device(); + target_dtype = t->scalar_type(); + current_dtype = target_dtype; + } + tensor(std::move(t)); + validate(); + } + + C10_ALWAYS_INLINE OperandInfo(const OperandInfo&) = default; + C10_ALWAYS_INLINE OperandInfo& operator=(const OperandInfo&) = default; + C10_ALWAYS_INLINE OperandInfo(OperandInfo&&) noexcept = default; + C10_ALWAYS_INLINE OperandInfo& operator=(OperandInfo&&) noexcept = default; + C10_ALWAYS_INLINE ~OperandInfo() = default; + + /// The data pointer. This may be different from tensor->data_ptr() if the + /// iterator is split. + void* data = nullptr; + + /// Stride after broadcasting. The stride is in bytes, not number of elements. 
+ StrideVector stride_bytes; + + /// The desired device and type for the operand. For inputs, this specifies + /// that the input should be converted to this type if necessary. For outputs, + /// this specifies which type to allocate. target_dtype and device are + /// initialized with the dtype and device of the tensor but during type + /// promotion target_dtype value can become different from tensor's dtype + /// also, during type promotion target_dtype and device can be set for an + /// undefined tensor so that tensor can be properly constructed later. + c10::optional device = c10::nullopt; + ScalarType target_dtype = ScalarType::Undefined; + // Caches dtype of the tensor, because scalar_type is an expensive operation + // If dtype of the tensor is changed (e.g. as a result of type promotion or in + // allocate_outputs), this + // value should be changed too. + ScalarType current_dtype = ScalarType::Undefined; + + bool is_device_defined() const { + return device.has_value(); + } + bool is_type_defined() const { + return target_dtype != ScalarType::Undefined; + } + TensorOptions options() const { + return TensorOptions(target_dtype).device(device); + } + + bool is_output = false; + + bool will_resize = false; + + bool is_read_write = false; + + bool is_const = false; + + void validate() { + TORCH_CHECK( + !tensor_base_->defined() || tensor_base_->layout() == kStrided, + "unsupported tensor layout: ", + tensor_base_->layout()); + } + + /// The tensor operand. Note that the strides, data pointer, and + /// other attributes may differ due to dimension reordering and + /// coalescing. + const Tensor& tensor() const { + return tensor_storage_.getTensor(); + } + const TensorBase& tensor_base() const { + return *tensor_base_; + } + void tensor(c10::MaybeOwned&& tensor); + + // Save the original tensor operand in cases when an output is modified + // (e.g. 
if dtype is changed) + const Tensor& original_tensor() const { + return original_tensor_storage_.getTensor(); + } + const TensorBase& original_tensor_base() const { + return *original_tensor_base_; + } + + // Set tensor to a new value, and store the old tensor value in + // original_tensor Should only ever be called once for the lifetime of an + // operand + void exchange_tensor(c10::MaybeOwned&& new_tensor); + + // Move original_tensor back into tensor, exchange_tensor must have been + // called before + void restore_original_tensor(); + + private: + c10::MaybeOwned tensor_base_; + c10::MaybeOwned original_tensor_base_ = + c10::MaybeOwned::owned(std::in_place); + + // We store TensorBase visibly in the header to allow inline access. + // However, we sometimes need a genuine `const Tensor &` for the + // TensorIterator API. So, we also store a non-owning `Tensor` + // object in these `_storage_` variables. + internal::OpaqueOptionalTensorRef tensor_storage_; + internal::OpaqueOptionalTensorRef original_tensor_storage_; +}; + +struct SplitUntil32Bit; + +enum class FastSetupType : uint8_t { + NONE, + CONTIGUOUS, + CHANNELS_LAST, + NON_OVERLAPPING_DENSE +}; + +class TensorIteratorConfig; +struct TensorIterator; + +struct TORCH_API TensorIteratorBase : public impl::MetaBase { + using DimMask = std::bitset<64>; + using PtrVector = SmallVector; + using StrideVector = SmallVector; + + TensorIteratorBase(); + void build(TensorIteratorConfig&); + + // The inner-loop function operates on the fastest moving dimension. It + // implements element-wise operations in terms of 1-d strided tensors. + // + // Arguments: + // data: data pointers for each operand (length `ntensors`) + // strides: stride for each operand (length `ntensors`) + // size: size of inner loop + // + // The `size` often matches shape[0], but may be smaller due to + // parallelization of the inner loop. 
+ using loop2d_t = c10::function_ref< + void(char** data, const int64_t* strides, int64_t size0, int64_t size1)>; + + using loop_subiter_t = c10::function_ref; + + void foreach_reduced_elt(loop_subiter_t loop, bool parallelize = true); + + int ndim() const { + return static_cast(shape_.size()); + } + IntArrayRef shape() const { + return shape_; + } + int64_t numel() const; + int ntensors() const { + return static_cast(operands_.size()); + } + int noutputs() const { + return num_outputs_; + } + int ninputs() const { + return ntensors() - noutputs(); + } + IntArrayRef view_offsets() const { + return view_offsets_; + } + + /// number of elements in the output operand. this is the same as numel() for + /// operations that are not reductions. + int64_t num_output_elements() const; + + /// number of reduced dimensions in a reduction operation + int num_reduce_dims() const; + + /// 1-dimensional iteration and no buffering or type conversion + bool is_trivial_1d() const; + /// Reducible to 1-dimensional and all operands are contiguous + bool is_contiguous() const; + bool is_dim_reduced(int dim) const; + + /// Accessors for each operand + IntArrayRef strides(int64_t arg) const { + return operands_[arg].stride_bytes; + } + void* data_ptr(int64_t arg) const; + ScalarType dtype(int64_t arg = 0) const { + return operands_[arg].current_dtype; + } + ScalarType common_dtype() const { + TORCH_INTERNAL_ASSERT( + common_dtype_ != ScalarType::Undefined, + "Queried for invalid common dtype!"); + return common_dtype_; + } + ScalarType input_dtype(int64_t arg = 0) const { + return operands_[num_outputs_ + arg].current_dtype; + } + Device device(int64_t arg = 0) const { + return operands_[arg].device.value(); + } + c10::DeviceType device_type(int64_t arg = 0) const { + return device(arg).type(); + } + int64_t element_size(int64_t arg) const { + return static_cast(elementSize(dtype(arg))); + } + bool is_scalar(int64_t arg) const; + bool is_cpu_scalar(int64_t arg) const; + + const 
TensorBase& tensor_base(int64_t arg) const { + return operands_[arg].tensor_base(); + } + const Tensor& tensor(int64_t arg) const { + return operands_[arg].tensor(); + } + + const TensorBase& output_base(int64_t arg = 0) const { + AT_ASSERT(arg < num_outputs_); + return tensor_base(arg); + } + + const Tensor& output(int64_t arg = 0) const { + AT_ASSERT(arg < num_outputs_); + return tensor(arg); + } + + const TensorBase& input_base(int64_t arg = 0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return tensor_base(num_outputs_ + arg); + } + const Tensor& input(int64_t arg = 0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return tensor(num_outputs_ + arg); + } + + // Copies from temporary outputs back to the original outputs + // NOTE: only used on CPU + void cast_outputs(); + + /// Removes an operand from this iterator + void remove_operand(int64_t arg); + /// Shrinks an iterated dimension + void narrow(int dim, int64_t start, int64_t size); + /// Narrows every dim after and including `start_dim` to size one. + void select_all_keeping_dim(int start_dim, IntArrayRef starts); + /// Replaces the data pointer for the operand at index `arg`. + /// The new pointer should have the same sizes, strides and dtype as the + /// original + void unsafe_replace_operand(int64_t arg, void* data); + + /// Splits this TensorIterator into two iterators. Together they iterate over + /// the entire operation. Used by `with_32bit_indexing()`. + std::unique_ptr split(int dim); + + /// Returns the dimension with the largest extent: (size[dim]-1) * stride[dim] + int get_dim_to_split() const; + + template + T scalar_value(int64_t arg) { + auto& op = operands_[arg]; + return c10::fetch_and_cast(op.tensor_base().scalar_type(), op.data); + } + + /// Return scalar value from original_tensor_base if it is defined. When + /// common_dtype is Half, casting scalar input to common_dtype might overflow. 
+ /// If the scalar is aleady given in the type of Half, then return scalar + /// value from tensor_base. + template + T original_scalar_value(int64_t arg) { + auto& original_tensor_base = operands_[arg].original_tensor_base(); + if (original_tensor_base.defined()) { + TORCH_INTERNAL_ASSERT( + original_tensor_base.scalar_type() != common_dtype()); + return c10::fetch_and_cast( + original_tensor_base.scalar_type(), + original_tensor_base.const_data_ptr()); + } else { + return scalar_value(arg); + } + } + + private: + template + auto loop_2d_from_1d(const loop1d_t& loop) { + return + [loop, ntensor = ntensors()]( + char** base, const int64_t* strides, int64_t size0, int64_t size1) { + PtrVector data(base, base + ntensor); + const int64_t* outer_strides = &strides[ntensor]; + for (const auto i : c10::irange(size1)) { + if (i > 0) { + for (const auto arg : c10::irange(ntensor)) { + data[arg] += outer_strides[arg]; + } + } + loop(data.data(), strides, size0); + } + }; + } + + public: + template < + typename loop1d_t, + std::enable_if_t< + std::is_convertible_v< + loop1d_t, + c10::function_ref< + void(char**, const int64_t* strides, int64_t size)>>, + int> = 0> + void for_each(loop1d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE) { + for_each(loop_2d_from_1d(loop), grain_size); + } + + void for_each(loop2d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); + + void parallel_reduce(loop2d_t loop); + + template < + typename loop1d_t, + std::enable_if_t< + std::is_convertible_v< + loop1d_t, + c10::function_ref< + void(char**, const int64_t* strides, int64_t size)>>, + int> = 0> + void serial_for_each(loop1d_t loop, Range range) { + serial_for_each(loop_2d_from_1d(loop), range); + } + + void serial_for_each(loop2d_t loop, Range range) const; + + /// Create a strides array for a Tensor with shape of this iterator. The + /// parameter `element_size` specifies the size of Tensor's data type in + /// bytes (e.g. 
`4` for `float`) + StrideVector compatible_stride(int64_t element_size) const; + + /// Inverts the re-ordering done by reorder_dimensions. This can only be + /// called *before* coalesce_dimensions() is called. + DimVector invert_perm(IntArrayRef input) const; + + /// Reapply same re-ordering as it is done by reorder_dimensions. This can + /// only be called *before* coalesce_dimensions() is called. + DimVector apply_perm_and_mul(IntArrayRef input, int mul) const; + + /// Helper functions for CPU iteration + StrideVector get_dim_strides(int dim) const; + StrideVector get_strides() const; + StrideVector get_inner_strides() const { + return get_dim_strides(0); + } + PtrVector get_base_ptrs() const; + + // Helper functions for advanced stride manipulations (e.g. torch.flip) + void _unsafe_set_arg_strides(const int64_t arg, IntArrayRef strides) { + operands_[arg].stride_bytes = strides; + } + void _unsafe_set_arg_data(const int64_t arg, void* data) { + operands_[arg].data = data; + } + + /// true if the stride computation can use 32-bit arithmetic. Used by GPU + /// kernels + bool can_use_32bit_indexing() const; + + /// An "iteratable" object that recursively splits this iterator into + /// sub-iterators that can use 32-bit indexing. + SplitUntil32Bit with_32bit_indexing() const; + + /// If the kernel should accumulate into the output. Only relevant for CUDA + /// reductions. + bool should_accumulate() const { + return accumulate_; + } + + /// Whether this iterator produces the actual output, + /// as opposed to something that will be accumulated further. Only relevant + /// for CUDA reductions. 
+ bool is_final_output() const { + return final_output_; + } + + bool has_contiguous_first_dim() const { + if (ndim() == 0) { + return true; + } + + int num_tensors = ntensors(); + for (const auto i : c10::irange(num_tensors)) { + if (strides(i)[0] != element_size(i)) { + return false; + } + } + return true; + } + + void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names) override; + +#define TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, maybestatic) \ + maybestatic void methodname( \ + TensorBase&& out, const TensorBase& a, const TensorBase& b) = delete; \ + maybestatic void methodname( \ + const TensorBase& out, TensorBase&& a, const TensorBase& b) = delete; \ + maybestatic void methodname( \ + const TensorBase& out, const TensorBase& a, TensorBase&& b) = delete; \ + maybestatic void methodname( \ + TensorBase&& out, TensorBase&& a, const TensorBase& b) = delete; \ + maybestatic void methodname( \ + TensorBase&& out, const TensorBase& a, TensorBase&& b) = delete; \ + maybestatic void methodname( \ + const TensorBase& out, TensorBase&& a, TensorBase&& b) = delete; \ + maybestatic void methodname( \ + TensorBase&& out, TensorBase&& a, TensorBase&& b) = delete; + +#define TORCH_DISALLOW_TEMPORARIES(methodname) \ + TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, ) + + void build_binary_float_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_borrowing_binary_float_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_binary_float_op) + void build_binary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_borrowing_binary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_binary_op) + void build_unary_float_op(const TensorBase& out, const TensorBase& a); + void 
build_borrowing_unary_float_op( + const TensorBase& out, + const TensorBase& a); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_float_op) + void build_unary_op(const TensorBase& out, const TensorBase& a); + // Odd special case needed for pow. Has to borrow the output because + // it's a structured kernel, but the argument is potentially a copy. + void build_output_borrowing_argument_owning_unary_op( + const TensorBase& out, + const TensorBase& a); + void build_borrowing_unary_op(const TensorBase& out, const TensorBase& a); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_op) + void build_borrowing_unary_force_boolean_op( + const TensorBase& out, + const TensorBase& a); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_force_boolean_op) + void build_comparison_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_borrowing_comparison_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_comparison_op) + // Another special case: we need to own the second argument for comparison + // ops. 
+ void build_borrowing_except_last_argument_comparison_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_ternary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b, + const TensorBase& c); + +#undef TORCH_DISALLOW_TEMPORARIES + protected: + // Mutable reference as it moves tensors out of TensorIteratorConfig + void populate_operands(TensorIteratorConfig&); + void mark_outputs(); + void mark_resize_outputs(const TensorIteratorConfig&); + void compute_mem_overlaps(const TensorIteratorConfig&); + void compute_shape(const TensorIteratorConfig&); + void compute_strides(const TensorIteratorConfig&); + void reorder_dimensions(); + void permute_dimensions(IntArrayRef perm); + void compute_types(const TensorIteratorConfig&); + ScalarType compute_common_dtype(); + void allocate_or_resize_outputs(); + bool fast_set_up(const TensorIteratorConfig&); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); + void compute_names(const TensorIteratorConfig&); + void propagate_names_to_outputs(); + void coalesce_dimensions(); + + protected: + /// Records the "computation" shape of the output tensor. The computation + /// shape is different from the regular shape in a few ways: + /// + /// - The shape may be permuted (via permute_dimensions) so that we + /// process the dimensions in the most computationally efficient order + /// (rather than the logical order given to us by the users.) + /// - The shape may have adjacent dimensions collapsed (via + /// coalesce_dimensions) so that we minimize the number of + /// dimensions we have to explicitly iterate over. For example, + /// a pointwise operation on a contiguous tensor "computationally" + /// consists of only a single dimension. + /// + /// In other words, the computation shape is the output shape as it + /// actually matters for implementing the kernel, but not necessarily the + /// output shape that the user will see in the end. 
+ /// + /// The lifecycle of mutations to shape_ in TensorIterator: + /// - declare_static_shape() sets an initial shape explicitly + /// provided by user, otherwise + /// - compute_shape() computes the true (non-computational) shape + /// specified by the user. + /// - reorder_dimensions() reorders dimensions to improve coalescing. + /// - coalesce_dimensions() then coalesces adjacent dimensions when + /// possible. + /// + /// The shape may also be further modified if we create sub-TensorIterators, + /// e.g., via narrow or select_all_keeping_dim. + DimVector shape_; + + /// Temporarily records the permutation computed by reorder_dimensions. + /// This permutation maps the computation output dimension (dim) to + /// the original true output dimension (perm_[dim]). It is used by + /// invert_perm to undo the permutation. After coalesce_dimensions is + /// called, the permutation is no longer valid (as, in general, there + /// is no permutation that will make computation dimensions to + /// output dimensions); methods that manipulate perm_ are obligated + /// to test that !has_coalesced_dimensions + DimVector perm_; + + /// Has coalesce_dimensions() (or any moral equivalent, e.g., fast_build()) + /// been called? This is SOLELY used to check validity of perm_. + bool has_coalesced_dimensions_ = false; + + /// Whether iteration must be fixed. This disables dimension permuting and + /// also changes how for_each divides work among threads. + bool enforce_linear_iteration_ = false; + + /// The index offsets into the original tensors for each dimension. + /// This is only non-zero when you narrow() a TensorIterator (e.g., + /// when you make sub-TensorIterators). + DimVector view_offsets_; + + /// The computed names of the output tensor. Computed by compute_names() + NameVector names_; + + /// The operands of the TensorIterator: both the inputs and outputs. The + /// outputs MUST come first in the operands_ list. 
There is always an + /// operand for each output of the TensorIterator, even if TensorIterator + /// will ultimately be responsible for allocating the output; in those + /// cases, tensor is simply undefined (and will be populated later + /// during build()). + /// + /// This list is initially populated prior to build(), but build() mutates + /// OperandInfo to populate more information. + SmallVector operands_; + + /// Number of outputs in operands_ (the length of the outputs prefix + /// in operands_). + int num_outputs_ = 0; + + /// Whether or not all operands have the same shape and are 1d+. Having all + /// the same shape affects whether or not the iterator is eligible for fast + /// setup. + bool all_ops_same_shape_ = false; + /// Whether or not all operands are 0d, this affects type promotion + bool all_ops_are_scalars_ = false; + + /// The "computation" dtype of TensorIterator, specifying what the dtype + /// we will do the internal computation in TensorIterator. Typically, + /// this matches the dtype of the output tensors, but not always! + ScalarType common_dtype_ = ScalarType::Undefined; + + /// This is currently defined as kCPU, or the device of the first non-CPU + /// tensor argument. See TensorIteratorBase::compute_types for details. 
+ Device common_device_ = kCPU; + + /// Set by split(), see should_accumulate() and is_final_output() + bool accumulate_ = false; + bool final_output_ = true; + + // From TensorIteratorConfig + bool is_reduction_ = false; + + /// Set by populate_operands(), says if we're handling meta tensors + bool is_meta_ = false; +}; + +struct TORCH_API TensorIterator final : public TensorIteratorBase { + TensorIterator() : TensorIteratorBase() {} + // Slicing is OK, TensorIterator guaranteed NOT to have any fields + TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} + +#define TORCH_DISALLOW_TEMPORARIES(methodname) \ + TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, static) + + static TensorIterator binary_float_op( + TensorBase& out, + const TensorBase& a, + const TensorBase& b); + static TensorIterator binary_op( + TensorBase& out, + const TensorBase& a, + const TensorBase& b); + static TensorIterator borrowing_binary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(borrowing_binary_op) + static TensorIterator comparison_op( + TensorBase& out, + const TensorBase& a, + const TensorBase& b); + static TensorIterator unary_op(TensorBase& out, const TensorBase& a); + static TensorIterator unary_float_op(TensorBase& out, const TensorBase& a); + static TensorIterator nullary_op(TensorBase& out); + static TensorIterator borrowing_nullary_op(const TensorBase& out); + static TensorIterator borrowing_nullary_op(TensorBase&& out) = delete; + static TensorIterator reduce_op(TensorBase& out, const TensorBase& a); + static TensorIterator reduce_op( + TensorBase& out1, + TensorBase& out2, + const TensorBase& a); +#undef TORCH_DISALLOW_TEMPORARIES +#undef TORCH_DISALLOW_TEMPORARIES_IMPL + + const Tensor& maybe_get_output(int64_t output_idx) override; + void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names) override; +}; + +class 
TORCH_API TensorIteratorConfig final { + public: + friend struct TensorIteratorBase; + friend struct TensorIterator; + + TensorIteratorConfig() = default; + + C10_DISABLE_COPY_AND_ASSIGN(TensorIteratorConfig); + + /// Construction + // Stores input/output Tensors without incrementing the reference count. + // Important: the outputs have to be added before the inputs. + TensorIteratorConfig& add_output(const TensorBase& output) { + return add_borrowed_output(output); + } + TensorIteratorConfig& add_input(const TensorBase& input) { + return add_borrowed_input(input); + } + TensorIteratorConfig& add_const_input(const TensorBase& input) { + return add_borrowed_const_input(input); + } + + // Borrowing from temporaries is unlikely to go well. + TensorIteratorConfig& add_output(TensorBase&& output) = delete; + TensorIteratorConfig& add_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_const_input(TensorBase&& input) = delete; + + // Stores input/output Tensors while incrementing the reference count. + // Note that add_{in,out}put are nearly always what you + // want, and the exception (adding an unnamed temporary) won't + // compile. + TensorIteratorConfig& add_owned_output(const TensorBase& output); + TensorIteratorConfig& add_owned_input(const TensorBase& input); + TensorIteratorConfig& add_owned_const_input(const TensorBase& input); + + // Advanced API: stores input/output Tensors without incrementing + // the reference count. The caller must ensure that these Tensors + // live at least as long as this TensorIteratorConfig and any + // TensorIteratorBase built from this TensorIteratorConfig. + // Important: the outputs have to be added before the inputs. + TensorIteratorConfig& add_borrowed_output(const TensorBase& output); + TensorIteratorConfig& add_borrowed_input(const TensorBase& input); + TensorIteratorConfig& add_borrowed_const_input(const TensorBase& input); + + // Borrowing from temporaries is unlikely to go well. 
+ TensorIteratorConfig& add_borrowed_output(TensorBase&& output) = delete; + TensorIteratorConfig& add_borrowed_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_borrowed_const_input(TensorBase&& input) = delete; + + // Sets the check_mem_overlap_ flag, which is true by default. + // If true, inputs are checked for partial overlap with the outputs and + // outputs are checked for internal overlap (e.g. broadcasted views). An error + // is raised if unacceptable overlap is detected. + // If you're migrating an existing operator to using TensorIterator, please + // consider if the previous implementation checked memory overlap. If it did + // not, and if the operator is idempotent (for example, Tensor.fill_(0)), then + // checking memory overlap is BC-breaking. Please don't check memory overlap + // in that case. + TensorIteratorConfig& set_check_mem_overlap(bool check_mem_overlap) { + check_mem_overlap_ = check_mem_overlap; + return *this; + } + + // Sets the check_all_same_dtype_ flag, which is true by default + // If true, checks that all inputs and defined outputs have the same dtype + // Setting either of promote_inputs_to_common_dtype_ + // or cast_common_dtype_to_outputs_ to true will set + // check_all_same_dtype_ to false. + TensorIteratorConfig& check_all_same_dtype(const bool _check_all_same_dtype) { + check_all_same_dtype_ = _check_all_same_dtype; + return *this; + } + + // Sets the check_all_same_device_ flag, which is true by default + // If true, all operands must be on the same device, with the possible + // exception of CPU scalars, which can be passed to some CUDA kernels + // as kernel arguments. 
+ TensorIteratorConfig& check_all_same_device( + const bool _check_all_same_device) { + check_all_same_device_ = _check_all_same_device; + return *this; + } + + // Sets the enforce_safe_casting_to_output_ flag, which is false by default + // If true, the iterator's "common dtype" must be computable + // (see the [Common Dtype Computation] note) and + // canCast(common dtype, output dtype) must be true for all outputs. + TensorIteratorConfig& enforce_safe_casting_to_output( + const bool _enforce_safe_casting_to_output) { + enforce_safe_casting_to_output_ = _enforce_safe_casting_to_output; + return *this; + } + + // Sets the enforce_linear_iteration_ flag, which is false by default. + // If true, iteration goes in the same order as a C-contiguous tensor + // is layed out in memory. i.e. last dimension iterates fastest. + // + // This iteration order can be less efficient and may even prevent + // vectorization. So only use if the correctness of your kernel depends on it. + TensorIteratorConfig& enforce_linear_iteration( + const bool _enforce_linear_iteration = true) { + enforce_linear_iteration_ = _enforce_linear_iteration; + return *this; + } + + // Sets the promote_inputs_to_common_dtype_ flag, which is false by default + // If true, the iterator's "common dtype" is always computed (see the + // [Common Dtype Computation] note) and, on the CPU, temporary copies of + // the inputs in the common dtype are passed as the actual inputs to + // the operation. + // Setting this flag to true sets check_all_same_dtype_ to false. + TensorIteratorConfig& promote_inputs_to_common_dtype( + const bool _promote_inputs_to_common_dtype) { + promote_inputs_to_common_dtype_ = _promote_inputs_to_common_dtype; + if (_promote_inputs_to_common_dtype) { + check_all_same_dtype_ = false; + } + return *this; + } + + // Sets the promote_integer_inputs_to_float_ flag, which is false by default + // NOTE: If set to true, the promote_inputs_to_common_dtype_ must also be + // true. 
If true, if the iterator's "common dtype" is an integral type + // (including bool) + // then it is changed to the default float scalar type. + TensorIteratorConfig& promote_integer_inputs_to_float( + const bool _promote_integer_inputs_to_float) { + promote_integer_inputs_to_float_ = _promote_integer_inputs_to_float; + TORCH_INTERNAL_ASSERT( + !promote_integer_inputs_to_float_ || promote_inputs_to_common_dtype_); + return *this; + } + + TensorIteratorConfig& is_reduction(const bool _is_reduction) { + is_reduction_ = _is_reduction; + return *this; + } + + TensorIteratorConfig& allow_cpu_scalars(const bool _allow_cpu_scalars) { + allow_cpu_scalars_ = _allow_cpu_scalars; + return *this; + } + + // Sets the cast_common_dtype_to_outputs_ flag, which is false by default + // If true, the iterator's "common dtype" must be computatable + // (see the [Common Dtype Computation] note) and, on the CPU, temporary + // copies of the outputs are passed as the actual output to the operation. + // These temporaries are then copied to the original outputs after + // the operation is performed (see cast_outputs()). + // Setting this flag to true sets check_all_same_dtype_ to false. + TensorIteratorConfig& cast_common_dtype_to_outputs( + const bool _cast_common_dtype_to_outputs) { + cast_common_dtype_to_outputs_ = _cast_common_dtype_to_outputs; + if (_cast_common_dtype_to_outputs) { + check_all_same_dtype_ = false; + } + return *this; + } + + TensorIteratorConfig& resize_outputs(bool resize_outputs) { + resize_outputs_ = resize_outputs; + return *this; + } + + // Bypass output dtype/device computation and fix the dtype/device as + // specified here. 
+ TensorIteratorConfig& declare_static_dtype_and_device( + ScalarType dtype, + Device device); + TensorIteratorConfig& declare_static_dtype(ScalarType dtype); + TensorIteratorConfig& declare_static_device(Device device); + TensorIteratorConfig& declare_static_shape(IntArrayRef shape); + TensorIteratorConfig& declare_static_shape( + IntArrayRef shape, + IntArrayRef squash_dims); + + // It would be better if this was && qualified, but this would be at the cost + // of a lot of boilerplate above + TensorIterator build() { + TensorIterator iter; + iter.build(*this); + return iter; + } + + private: + bool is_tensor_const(size_t idx); + + SmallVector, 4> tensors_; + int num_outputs_ = 0; + int num_inputs_ = 0; + + c10::optional static_shape_ = c10::nullopt; + c10::optional static_dtype_ = c10::nullopt; + c10::optional static_device_ = c10::nullopt; + bool check_mem_overlap_ = true; + bool allow_cpu_scalars_ = false; + bool is_reduction_ = false; + bool resize_outputs_ = true; + bool check_all_same_dtype_ = true; + bool check_all_same_device_ = true; + bool enforce_safe_casting_to_output_ = false; + bool enforce_linear_iteration_ = false; + bool promote_inputs_to_common_dtype_ = false; + bool promote_integer_inputs_to_float_ = false; + bool cast_common_dtype_to_outputs_ = false; + + SmallVector const_tensor_indices_; +}; + +/// A container-like struct that acts as if it contains splits of a +/// TensorIterator that can use 32-bit indexing. Taken together the splits cover +/// the original TensorIterator. +struct TORCH_API SplitUntil32Bit { + struct TORCH_API iterator { + iterator() = default; + iterator(const TensorIteratorBase& iter); + iterator(iterator&&) = default; + + // Guaranteed to be a TensorIterator proper! 
+ TensorIterator& operator*() const; + iterator& operator++(); + bool operator==(const iterator& other) const { + // two iterators are equal if they are the same object or they're both + // empty + return this == &other || (vec.empty() && other.vec.empty()); + } + // needed for C++11 range-based for loop + bool operator!=(const iterator& other) const { + return !(*this == other); + } + + /// stack of TensorIterators to be split + std::vector> vec; + }; + + SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} + + iterator begin() const; + iterator end() const; + + private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const TensorIteratorBase& iter; +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/div_rtn.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/div_rtn.h new file mode 100644 index 0000000000000000000000000000000000000000..4935f49ae2726389441e4012cc15bcf3981f2e84 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/div_rtn.h @@ -0,0 +1,11 @@ +#pragma once + +// Integer division rounding to -Infinity +template +static inline T div_rtn(T x, T y) { + int q = x / y; + int r = x % y; + if ((r != 0) && ((r < 0) != (y < 0))) + --q; + return q; +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jit_macros.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jit_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..9af826549021a0853beb83c74b6ac695728ab054 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jit_macros.h @@ -0,0 +1,7 @@ +#pragma once +#include +#include + +// AT_USE_JITERATOR(), controls whether we jit some elementwise kernels +#define AT_USE_JITERATOR() true +#define jiterator_stringify(...) 
std::string(#__VA_ARGS__); diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jiterator_macros.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jiterator_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..3aa4c7ebb0af07fd65012d9d531aaa140dd6c212 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jiterator_macros.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include + +#define JITERATOR_HOST_DEVICE C10_HOST_DEVICE +#if defined(_MSC_VER) && defined(__CUDACC__) +// NVRTC on Windows errors if __host__ __device__ attribute is +// present on kernel. +// error: attribute "__host__" does not apply here +// error: attribute "__device__" does not apply here +#define JITERATOR_HOST_DEVICE +#endif + +// jiterator_also_stringify_as macro is used to define code (for CPU/ROCm) +// and generate code string for `jiterator` (only when compiling for CUDA). +// Usage : +// jiterator_also_stringify_as( +// jiterator_code(template T identity(T x) { return x; }), +// identity_string); +// This will define the template `identity` as present in code and +// also define `std::string identity_string` with the code as the string +// if this is being compiled for CUDA. + +// `jiterator_code` macro is to deal with `,` in the kernel code. +// These `,`s confuse the preprocessor into thinking we are passing +// multiple arguments to the macro. +#define jiterator_code(...) __VA_ARGS__ +#if defined(__CUDACC__) || defined(__HIPCC__) +// CPU and CUDA and ROCm case +#define stringify_code(...) 
#__VA_ARGS__ +#define jiterator_also_stringify_as(code, str_name) \ + code /* define the function */ \ + const std::string str_name = std::string(stringify_code(code)); +#else +// CPU only or CPU and ROCm case +// Only needs the function +#define jiterator_also_stringify_as(code, str_name) code +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/caffe2/serialize/crc_alt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/caffe2/serialize/crc_alt.h new file mode 100644 index 0000000000000000000000000000000000000000..9d1c4f1dc7ddc8997f7cc1297ef20d74de67afe0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/caffe2/serialize/crc_alt.h @@ -0,0 +1,1343 @@ +#pragma once + +// ////////////////////////////////////////////////////////// +// Crc32.h +// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved. +// Slicing-by-16 contributed by Bulat Ziganshin +// Tableless bytewise CRC contributed by Hagai Gold +// see http://create.stephan-brumme.com/disclaimer.html +// + +// if running on an embedded system, you might consider shrinking the +// big Crc32Lookup table by undefining these lines: +#define CRC32_USE_LOOKUP_TABLE_BYTE +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +// - crc32_bitwise doesn't need it at all +// - crc32_halfbyte has its own small lookup table +// - crc32_1byte_tableless and crc32_1byte_tableless2 don't need it at all +// - crc32_1byte needs only Crc32Lookup[0] +// - crc32_4bytes needs only Crc32Lookup[0..3] +// - crc32_8bytes needs only Crc32Lookup[0..7] +// - crc32_4x8bytes needs only Crc32Lookup[0..7] +// - crc32_16bytes needs all of Crc32Lookup +// using the aforementioned #defines the table is automatically fitted to your needs + +// uint8_t, uint32_t, int32_t +#include +// size_t +#include + +// crc32_fast selects the fastest 
algorithm depending on flags (CRC32_USE_LOOKUP_...) +/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs +uint32_t crc32_fast (const void* data, size_t length, uint32_t previousCrc32 = 0); + +/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA)) +uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB); + +/// compute CRC32 (bitwise algorithm) +uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (half-byte algoritm) +uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0); + +#ifdef CRC32_USE_LOOKUP_TABLE_BYTE +/// compute CRC32 (standard algorithm) +uint32_t crc32_1byte (const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32 = 0); + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +/// compute CRC32 (Slicing-by-4 algorithm) +uint32_t crc32_4bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +/// compute CRC32 (Slicing-by-8 algorithm) +uint32_t crc32_8bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times +uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +/// compute CRC32 (Slicing-by-16 algorithm) +uint32_t crc32_16bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks) +uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32 = 0, size_t 
prefetchAhead = 256); +#endif + +// ////////////////////////////////////////////////////////// +// Crc32.cpp +// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved. +// Slicing-by-16 contributed by Bulat Ziganshin +// Tableless bytewise CRC contributed by Hagai Gold +// see http://create.stephan-brumme.com/disclaimer.html +// + +// if running on an embedded system, you might consider shrinking the +// big Crc32Lookup table: +// - crc32_bitwise doesn't need it at all +// - crc32_halfbyte has its own small lookup table +// - crc32_1byte needs only Crc32Lookup[0] +// - crc32_4bytes needs only Crc32Lookup[0..3] +// - crc32_8bytes needs only Crc32Lookup[0..7] +// - crc32_4x8bytes needs only Crc32Lookup[0..7] +// - crc32_16bytes needs all of Crc32Lookup + + +#ifndef __LITTLE_ENDIAN + #define __LITTLE_ENDIAN 1234 +#endif +#ifndef __BIG_ENDIAN + #define __BIG_ENDIAN 4321 +#endif + +// define endianess and some integer data types +#if defined(_MSC_VER) || defined(__MINGW32__) + // Windows always little endian + #define __BYTE_ORDER __LITTLE_ENDIAN + + // intrinsics / prefetching + #if defined(_M_ARM64) + #include + #else + #include + #endif + + #ifdef __MINGW32__ + #define PREFETCH(location) __builtin_prefetch(location) + #else + #if defined(_M_ARM64) + #define PREFETCH(location) __prefetch(location) + #else + #define PREFETCH(location) _mm_prefetch(location, _MM_HINT_T0) + #endif + #endif +#elif defined(__APPLE__) + #include + #if TARGET_IPHONE_SIMULATOR + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif TARGET_OS_IPHONE + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif TARGET_OS_MAC + #include + #if defined(__BIG_ENDIAN__) + #define __BYTE_ORDER __BIG_ENDIAN + #endif + #if defined(__LITTLE_ENDIAN__) + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif + #else + # error "Unknown Apple platform" + #endif +#elif defined(__ARMEB__) + #define __BYTE_ORDER __BIG_ENDIAN +#elif (defined(__BYTE_ORDER__) and !defined(__BYTE_ORDER)) + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + 
#define __BYTE_ORDER __BIG_ENDIAN + #else + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif +#else + // defines __BYTE_ORDER as __LITTLE_ENDIAN or __BIG_ENDIAN + #include +#endif + +// intrinsics / prefetching +#ifdef __GNUC__ + #define PREFETCH(location) __builtin_prefetch(location) +#else +#ifndef PREFETCH + // no prefetching + #define PREFETCH(location) ; +#endif +#endif + +// abort if byte order is undefined +#ifndef __BYTE_ORDER +#error undefined byte order, compile with -D__BYTE_ORDER=1234 (if little endian) or -D__BYTE_ORDER=4321 (big endian) +#endif + + +namespace +{ + /// zlib's CRC32 polynomial + const uint32_t Polynomial = 0xEDB88320; + + /// swap endianess + static inline uint32_t swap(uint32_t x) + { + #if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(x); + #else + return (x >> 24) | + ((x >> 8) & 0x0000FF00) | + ((x << 8) & 0x00FF0000) | + (x << 24); + #endif + } + + /// Slicing-By-16 + #ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + const size_t MaxSlice = 16; + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + const size_t MaxSlice = 8; + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + const size_t MaxSlice = 4; + #elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + const size_t MaxSlice = 1; + #else + #define NO_LUT // don't need Crc32Lookup at all + #endif + +} // anonymous namespace + +#ifndef NO_LUT +/// forward declaration, table is at the end of this file +extern const uint32_t Crc32Lookup[MaxSlice][256]; // extern is needed to keep compiler happy +#endif + + +/// compute CRC32 (bitwise algorithm) +uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + crc ^= *current++; + + for (int j = 0; j < 8; j++) + { + // branch-free + crc = (crc >> 1) ^ (-int32_t(crc & 1) & Polynomial); + + // branching, much slower: + //if (crc & 1) + // crc = (crc >> 1) ^ 
Polynomial; + //else + // crc = crc >> 1; + } + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (half-byte algoritm) +uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + /// look-up table for half-byte, same as crc32Lookup[0][16*i] + static const uint32_t Crc32Lookup16[16] = + { + 0x00000000,0x1DB71064,0x3B6E20C8,0x26D930AC,0x76DC4190,0x6B6B51F4,0x4DB26158,0x5005713C, + 0xEDB88320,0xF00F9344,0xD6D6A3E8,0xCB61B38C,0x9B64C2B0,0x86D3D2D4,0xA00AE278,0xBDBDF21C + }; + + while (length-- != 0) + { + crc = Crc32Lookup16[(crc ^ *current ) & 0x0F] ^ (crc >> 4); + crc = Crc32Lookup16[(crc ^ (*current >> 4)) & 0x0F] ^ (crc >> 4); + current++; + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +#ifdef CRC32_USE_LOOKUP_TABLE_BYTE +/// compute CRC32 (standard algorithm) +uint32_t crc32_1byte(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *current++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + uint8_t s = uint8_t(crc) ^ *current++; + + // Hagai Gold made me aware of this table-less algorithm and send me code + + // polynomial 0xEDB88320 can be written in binary as 11101101101110001000001100100000b + // reverse the bits (or just assume bit 0 is the first one) + // and we have bits set at position 0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 16, 22, 23, 26 + // => those are the shift 
offsets: + //crc = (crc >> 8) ^ + // t ^ + // (t >> 1) ^ (t >> 2) ^ (t >> 4) ^ (t >> 5) ^ // == y + // (t >> 7) ^ (t >> 8) ^ (t >> 10) ^ (t >> 11) ^ // == y >> 6 + // (t >> 12) ^ (t >> 16) ^ // == z + // (t >> 22) ^ (t >> 26) ^ // == z >> 10 + // (t >> 23); + + // the fastest I can come up with: + uint32_t low = (s ^ (s << 6)) & 0xFF; + uint32_t a = (low * ((1 << 23) + (1 << 14) + (1 << 2))); + crc = (crc >> 8) ^ + (low * ((1 << 24) + (1 << 16) + (1 << 8))) ^ + a ^ + (a >> 1) ^ + (low * ((1 << 20) + (1 << 12) )) ^ + (low << 19) ^ + (low << 17) ^ + (low >> 2); + + // Hagai's code: + /*uint32_t t = (s ^ (s << 6)) << 24; + // some temporaries to optimize XOR + uint32_t x = (t >> 1) ^ (t >> 2); + uint32_t y = x ^ (x >> 3); + uint32_t z = (t >> 12) ^ (t >> 16); + crc = (crc >> 8) ^ + t ^ (t >> 23) ^ + y ^ (y >> 6) ^ + z ^ (z >> 10);*/ + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32) +{ + int32_t crc = ~previousCrc32; // note: signed integer, right shift distributes sign bit into lower bits + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + crc = crc ^ *current++; + + uint32_t c = (((crc << 31) >> 31) & ((Polynomial >> 7) ^ (Polynomial >> 1))) ^ + (((crc << 30) >> 31) & ((Polynomial >> 6) ^ Polynomial)) ^ + (((crc << 29) >> 31) & (Polynomial >> 5)) ^ + (((crc << 28) >> 31) & (Polynomial >> 4)) ^ + (((crc << 27) >> 31) & (Polynomial >> 3)) ^ + (((crc << 26) >> 31) & (Polynomial >> 2)) ^ + (((crc << 25) >> 31) & (Polynomial >> 1)) ^ + (((crc << 24) >> 31) & Polynomial); + + crc = ((uint32_t)crc >> 8) ^ c; // convert to unsigned integer before right shift + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +/// compute CRC32 (Slicing-by-4 algorithm) +uint32_t crc32_4bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t 
crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // process four bytes at once (Slicing-by-4) + while (length >= 4) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + crc = Crc32Lookup[0][ one & 0xFF] ^ + Crc32Lookup[1][(one>> 8) & 0xFF] ^ + Crc32Lookup[2][(one>>16) & 0xFF] ^ + Crc32Lookup[3][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + crc = Crc32Lookup[0][(one>>24) & 0xFF] ^ + Crc32Lookup[1][(one>>16) & 0xFF] ^ + Crc32Lookup[2][(one>> 8) & 0xFF] ^ + Crc32Lookup[3][ one & 0xFF]; +#endif + + length -= 4; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 3 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +/// compute CRC32 (Slicing-by-8 algorithm) +uint32_t crc32_8bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // process eight bytes at once (Slicing-by-8) + while (length >= 8) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + crc = Crc32Lookup[0][ two & 0xFF] ^ + Crc32Lookup[1][(two>> 8) & 0xFF] ^ + Crc32Lookup[2][(two>>16) & 0xFF] ^ + Crc32Lookup[3][(two>>24) & 0xFF] ^ + Crc32Lookup[4][ one & 0xFF] ^ + Crc32Lookup[5][(one>> 8) & 0xFF] ^ + Crc32Lookup[6][(one>>16) & 0xFF] ^ + Crc32Lookup[7][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + crc = Crc32Lookup[0][(two>>24) & 0xFF] ^ + Crc32Lookup[1][(two>>16) & 0xFF] ^ + Crc32Lookup[2][(two>> 8) & 0xFF] ^ + Crc32Lookup[3][ two & 0xFF] ^ + Crc32Lookup[4][(one>>24) & 0xFF] ^ + Crc32Lookup[5][(one>>16) & 0xFF] ^ + Crc32Lookup[6][(one>> 8) & 0xFF] ^ + 
Crc32Lookup[7][ one & 0xFF]; +#endif + + length -= 8; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 7 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times +uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the inner for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 8 * Unroll; + + // process 4x eight bytes at once (Slicing-by-8) + while (length >= BytesAtOnce) + { + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + crc = Crc32Lookup[0][ two & 0xFF] ^ + Crc32Lookup[1][(two>> 8) & 0xFF] ^ + Crc32Lookup[2][(two>>16) & 0xFF] ^ + Crc32Lookup[3][(two>>24) & 0xFF] ^ + Crc32Lookup[4][ one & 0xFF] ^ + Crc32Lookup[5][(one>> 8) & 0xFF] ^ + Crc32Lookup[6][(one>>16) & 0xFF] ^ + Crc32Lookup[7][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + crc = Crc32Lookup[0][(two>>24) & 0xFF] ^ + Crc32Lookup[1][(two>>16) & 0xFF] ^ + Crc32Lookup[2][(two>> 8) & 0xFF] ^ + Crc32Lookup[3][ two & 0xFF] ^ + Crc32Lookup[4][(one>>24) & 0xFF] ^ + Crc32Lookup[5][(one>>16) & 0xFF] ^ + Crc32Lookup[6][(one>> 8) & 0xFF] ^ + Crc32Lookup[7][ one & 0xFF]; +#endif + + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 31 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 + + 
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +/// compute CRC32 (Slicing-by-16 algorithm) +uint32_t crc32_16bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the inner for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 16 * Unroll; + + while (length >= BytesAtOnce) + { + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][ four & 0xFF] ^ + Crc32Lookup[ 1][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 3][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 4][ three & 0xFF] ^ + Crc32Lookup[ 5][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 7][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 8][ two & 0xFF] ^ + Crc32Lookup[ 9][(two >> 8) & 0xFF] ^ + Crc32Lookup[10][(two >> 16) & 0xFF] ^ + Crc32Lookup[11][(two >> 24) & 0xFF] ^ + Crc32Lookup[12][ one & 0xFF] ^ + Crc32Lookup[13][(one >> 8) & 0xFF] ^ + Crc32Lookup[14][(one >> 16) & 0xFF] ^ + Crc32Lookup[15][(one >> 24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 1][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 3][ four & 0xFF] ^ + Crc32Lookup[ 4][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 5][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 7][ three & 0xFF] ^ + Crc32Lookup[ 8][(two >> 24) & 0xFF] ^ + Crc32Lookup[ 9][(two >> 16) & 0xFF] ^ + Crc32Lookup[10][(two >> 8) & 0xFF] ^ + Crc32Lookup[11][ two & 0xFF] ^ + Crc32Lookup[12][(one >> 24) & 0xFF] ^ + 
Crc32Lookup[13][(one >> 16) & 0xFF] ^ + Crc32Lookup[14][(one >> 8) & 0xFF] ^ + Crc32Lookup[15][ one & 0xFF]; +#endif + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 63 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks) +uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32, size_t prefetchAhead) +{ + // CRC code is identical to crc32_16bytes (including unrolling), only added prefetching + // 256 bytes look-ahead seems to be the sweet spot on Core i7 CPUs + + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 16 * Unroll; + + while (length >= BytesAtOnce + prefetchAhead) + { + PREFETCH(((const char*) current) + prefetchAhead); + + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][ four & 0xFF] ^ + Crc32Lookup[ 1][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 3][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 4][ three & 0xFF] ^ + Crc32Lookup[ 5][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 7][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 8][ two & 0xFF] ^ + Crc32Lookup[ 9][(two >> 8) & 0xFF] ^ + Crc32Lookup[10][(two >> 16) & 0xFF] ^ + Crc32Lookup[11][(two >> 24) & 0xFF] ^ + Crc32Lookup[12][ one & 0xFF] ^ + Crc32Lookup[13][(one >> 8) & 0xFF] ^ + Crc32Lookup[14][(one >> 16) & 0xFF] ^ + Crc32Lookup[15][(one >> 24) & 0xFF]; +#else 
+ uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 1][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 3][ four & 0xFF] ^ + Crc32Lookup[ 4][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 5][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 7][ three & 0xFF] ^ + Crc32Lookup[ 8][(two >> 24) & 0xFF] ^ + Crc32Lookup[ 9][(two >> 16) & 0xFF] ^ + Crc32Lookup[10][(two >> 8) & 0xFF] ^ + Crc32Lookup[11][ two & 0xFF] ^ + Crc32Lookup[12][(one >> 24) & 0xFF] ^ + Crc32Lookup[13][(one >> 16) & 0xFF] ^ + Crc32Lookup[14][(one >> 8) & 0xFF] ^ + Crc32Lookup[15][ one & 0xFF]; +#endif + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 63 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs +uint32_t crc32_fast(const void* data, size_t length, uint32_t previousCrc32) +{ +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + return crc32_16bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + return crc32_8bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + return crc32_4bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + return crc32_1byte (data, length, previousCrc32); +#else + return crc32_halfbyte(data, length, previousCrc32); +#endif +} + + +/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA)) +uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) +{ + // based on Mark Adler's crc_combine from + // https://github.com/madler/pigz/blob/master/pigz.c + + // main idea: + // - if you have two 
equally-sized blocks A and B, + // then you can create a block C = A ^ B + // which has the property crc(C) = crc(A) ^ crc(B) + // - if you append length(B) zeros to A and call it A' (think of it as AAAA000) + // and prepend length(A) zeros to B and call it B' (think of it as 0000BBB) + // then exists a C' = A' ^ B' + // - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X + // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B') + // - the trick is to compute crc(A') based on crc(A) + // and crc(B') based on crc(B) + // - since B' starts with many zeros, the crc of those initial zeros is still zero + // - that means crc(B') = crc(B) + // - unfortunately the trailing zeros of A' change the crc, so usually crc(A') != crc(A) + // - the following code is a fast algorithm to compute crc(A') + // - starting with crc(A) and appending length(B) zeros, needing just log2(length(B)) iterations + // - the details are explained by the original author at + // https://stackoverflow.com/questions/23122312/crc-calculation-of-a-mostly-static-data-stream/23126768 + // + // notes: + // - I squeezed everything into one function to keep global namespace clean (original code two helper functions) + // - most original comments are still in place, I added comments where these helper functions where made inline code + // - performance-wise there isn't any differenze to the original zlib/pigz code + + // degenerated case + if (lengthB == 0) + return crcA; + + /// CRC32 => 32 bits + const uint32_t CrcBits = 32; + + uint32_t odd [CrcBits]; // odd-power-of-two zeros operator + uint32_t even[CrcBits]; // even-power-of-two zeros operator + + // put operator for one zero bit in odd + odd[0] = Polynomial; // CRC-32 polynomial + for (uint32_t i = 1; i < CrcBits; i++) + odd[i] = 1 << (i - 1); + + // put operator for two zero bits in even + // same as gf2_matrix_square(even, odd); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = odd[i]; 
+ even[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + even[i] ^= odd[j]; + } + // put operator for four zero bits in odd + // same as gf2_matrix_square(odd, even); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = even[i]; + odd[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + odd[i] ^= even[j]; + } + + // the following loop becomes much shorter if I keep swapping even and odd + uint32_t* a = even; + uint32_t* b = odd; + // apply secondLength zeros to firstCrc32 + for (; lengthB > 0; lengthB >>= 1) + { + // same as gf2_matrix_square(a, b); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = b[i]; + a[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + a[i] ^= b[j]; + } + + // apply zeros operator for this bit + if (lengthB & 1) + { + // same as firstCrc32 = gf2_matrix_times(a, firstCrc32); + uint32_t sum = 0; + for (int i = 0; crcA != 0; i++, crcA >>= 1) + if (crcA & 1) + sum ^= a[i]; + crcA = sum; + } + + // switch even and odd + uint32_t* t = a; a = b; b = t; + } + + // return combined crc + return crcA ^ crcB; +} + + +// ////////////////////////////////////////////////////////// +// constants + + +#ifndef NO_LUT +/// look-up table, already declared above +const uint32_t Crc32Lookup[MaxSlice][256] = +{ + //// same algorithm as crc32_bitwise + //for (int i = 0; i <= 0xFF; i++) + //{ + // uint32_t crc = i; + // for (int j = 0; j < 8; j++) + // crc = (crc >> 1) ^ ((crc & 1) * Polynomial); + // Crc32Lookup[0][i] = crc; + //} + //// ... and the following slicing-by-8 algorithm (from Intel): + //// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf + //// http://sourceforge.net/projects/slicing-by-8/ + //for (int slice = 1; slice < MaxSlice; slice++) + // Crc32Lookup[slice][i] = (Crc32Lookup[slice - 1][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[slice - 1][i] & 0xFF]; + { + // note: the first number of every second row corresponds to the half-byte look-up table ! 
+ 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 
0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + } + +#if defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) + // beyond this point only relevant for Slicing-by-4, Slicing-by-8 and Slicing-by-16 + ,{ + 0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7, + 0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF, + 0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496, + 0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E, + 0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265, + 0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D, + 0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034, + 0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C, + 
0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2, + 0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA, + 0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93, + 0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B, + 0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60, + 0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768, + 0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31, + 0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539, + 0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C, + 0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484, + 0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD, + 0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5, + 0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E, + 0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026, + 0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F, + 0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277, + 0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189, + 0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81, + 0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8, + 0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0, + 0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B, + 0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23, 
+ 0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A, + 0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72, + }, + + { + 0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685, + 0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D, + 0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5, + 0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D, + 0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065, + 0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD, + 0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315, + 0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD, + 0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45, + 0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD, + 0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835, + 0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D, + 0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5, + 0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D, + 0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5, + 0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D, + 0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05, + 0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD, + 0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75, + 
0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD, + 0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5, + 0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D, + 0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895, + 0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D, + 0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5, + 0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D, + 0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5, + 0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D, + 0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625, + 0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D, + 0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555, + 0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED, + }, + + { + 0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9, + 0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056, + 0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26, + 0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9, + 0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787, + 0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68, + 0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018, + 0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7, + 
0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084, + 0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B, + 0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B, + 0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4, + 0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA, + 0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755, + 0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825, + 0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA, + 0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82, + 0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D, + 0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D, + 0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2, + 0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC, + 0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953, + 0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623, + 0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC, + 0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF, + 0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50, + 0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120, + 0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF, + 0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981, + 0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E, 
+ 0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E, + 0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1, + } +#endif // defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) +#if defined (CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) + // beyond this point only relevant for Slicing-by-8 and Slicing-by-16 + ,{ + 0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10, + 0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1, + 0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92, + 0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053, + 0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314, + 0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5, + 0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496, + 0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57, + 0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459, + 0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98, + 0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB, + 0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A, + 0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D, + 0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C, + 0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF, + 0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E, + 
0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82, + 0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743, + 0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00, + 0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1, + 0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386, + 0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847, + 0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404, + 0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5, + 0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB, + 0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A, + 0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349, + 0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888, + 0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF, + 0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E, + 0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D, + 0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C, + }, + + { + 0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8, + 0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5, + 0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223, + 0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E, + 0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E, + 
0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3, + 0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715, + 0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578, + 0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4, + 0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9, + 0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F, + 0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22, + 0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2, + 0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F, + 0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79, + 0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14, + 0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460, + 0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D, + 0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB, + 0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496, + 0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156, + 0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B, + 0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD, + 0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0, + 0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C, + 0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61, + 0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97, 
+ 0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA, + 0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A, + 0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957, + 0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1, + 0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC, + }, + + { + 0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E, + 0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9, + 0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240, + 0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27, + 0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712, + 0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975, + 0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC, + 0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB, + 0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7, + 0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590, + 0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739, + 0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E, + 0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B, + 0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C, + 0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5, + 0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2, + 
0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C, + 0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B, + 0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2, + 0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5, + 0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0, + 0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387, + 0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E, + 0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49, + 0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105, + 0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62, + 0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB, + 0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC, + 0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899, + 0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE, + 0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457, + 0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30, + }, + + { + 0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919, + 0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC, + 0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832, + 0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387, + 0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F, + 
0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA, + 0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64, + 0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1, + 0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4, + 0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041, + 0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF, + 0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A, + 0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2, + 0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217, + 0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889, + 0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C, + 0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3, + 0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776, + 0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8, + 0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D, + 0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95, + 0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520, + 0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE, + 0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B, + 0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E, + 0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B, + 0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05, 
+ 0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0, + 0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78, + 0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD, + 0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53, + 0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6, + } +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 || CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + // beyond this point only relevant for Slicing-by-16 + ,{ + 0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9, + 0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1, + 0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8, + 0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0, + 0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A, + 0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72, + 0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B, + 0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03, + 0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE, + 0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6, + 0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF, + 0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7, + 0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D, + 0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75, + 
0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C, + 0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04, + 0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86, + 0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E, + 0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7, + 0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF, + 0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25, + 0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D, + 0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54, + 0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C, + 0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81, + 0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99, + 0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0, + 0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8, + 0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22, + 0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A, + 0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53, + 0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B, + }, + + { + 0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79, + 0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D, + 0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91, + 
0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65, + 0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9, + 0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D, + 0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941, + 0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5, + 0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9, + 0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D, + 0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31, + 0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5, + 0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09, + 0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD, + 0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1, + 0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15, + 0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278, + 0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C, + 0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390, + 0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364, + 0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8, + 0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C, + 0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040, + 0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4, + 0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8, 
+ 0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C, + 0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430, + 0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4, + 0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608, + 0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC, + 0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0, + 0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714, + }, + + { + 0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583, + 0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6, + 0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148, + 0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D, + 0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54, + 0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11, + 0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F, + 0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA, + 0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C, + 0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29, + 0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7, + 0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2, + 0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB, + 0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE, + 
0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770, + 0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635, + 0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C, + 0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159, + 0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7, + 0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592, + 0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB, + 0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E, + 0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00, + 0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45, + 0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3, + 0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6, + 0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38, + 0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D, + 0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624, + 0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761, + 0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF, + 0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA, + }, + + { + 0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F, + 0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71, + 0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473, + 
0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D, + 0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277, + 0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489, + 0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B, + 0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975, + 0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F, + 0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881, + 0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383, + 0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D, + 0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587, + 0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379, + 0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B, + 0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85, + 0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F, + 0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091, + 0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93, + 0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D, + 0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97, + 0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69, + 0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B, + 0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695, + 0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F, 
+ 0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761, + 0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63, + 0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D, + 0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67, + 0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99, + 0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B, + 0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165, + }, + + { + 0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658, + 0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535, + 0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082, + 0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF, + 0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD, + 0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0, + 0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77, + 0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A, + 0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3, + 0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E, + 0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129, + 0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244, + 0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06, + 0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B, + 
0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC, + 0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1, + 0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F, + 0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022, + 0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595, + 0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8, + 0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA, + 0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7, + 0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60, + 0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D, + 0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4, + 0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189, + 0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E, + 0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753, + 0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911, + 0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C, + 0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB, + 0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6, + }, + + { + 0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0, + 0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6, + 0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC, + 
0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A, + 0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218, + 0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E, + 0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74, + 0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042, + 0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31, + 0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307, + 0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D, + 0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B, + 0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9, + 0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF, + 0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985, + 0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3, + 0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522, + 0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14, + 0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E, + 0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778, + 0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA, + 0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC, + 0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196, + 0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0, + 0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3, 
+ 0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5, + 0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF, + 0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089, + 0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B, + 0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D, + 0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667, + 0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851, + }, + + { + 0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A, + 0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D, + 0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055, + 0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2, + 0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184, + 0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03, + 0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB, + 0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C, + 0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467, + 0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0, + 0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28, + 0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF, + 0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9, + 0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E, + 
0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6, + 0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931, + 0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1, + 0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326, + 0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE, + 0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69, + 0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F, + 0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8, + 0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70, + 0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7, + 0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC, + 0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B, + 0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93, + 0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714, + 0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42, + 0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5, + 0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D, + 0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A, + }, + + { + 0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875, + 0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A, + 0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB, + 
0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4, + 0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308, + 0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47, + 0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96, + 0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9, + 0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F, + 0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0, + 0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011, + 0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E, + 0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2, + 0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD, + 0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C, + 0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423, + 0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581, + 0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE, + 0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F, + 0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450, + 0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC, + 0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3, + 0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062, + 0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D, + 0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B, 
+ 0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34, + 0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5, + 0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA, + 0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806, + 0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749, + 0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698, + 0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7, + } +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +}; +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/cpuinfo.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/cpuinfo.h new file mode 100644 index 0000000000000000000000000000000000000000..dfb535f1c9e25d133e98253370c917a306c57119 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/cpuinfo.h @@ -0,0 +1,1956 @@ +#pragma once +#ifndef CPUINFO_H +#define CPUINFO_H + +#ifndef __cplusplus + #include +#endif + +#ifdef __APPLE__ + #include +#endif + +#include + +/* Identify architecture and define corresponding macro */ + +#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) + #define CPUINFO_ARCH_X86 1 +#endif + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + #define CPUINFO_ARCH_X86_64 1 +#endif + +#if defined(__arm__) || defined(_M_ARM) + #define CPUINFO_ARCH_ARM 1 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) + #define CPUINFO_ARCH_ARM64 1 +#endif + +#if defined(__PPC64__) || defined(__powerpc64__) || defined(_ARCH_PPC64) + #define CPUINFO_ARCH_PPC64 1 +#endif + +#if defined(__asmjs__) + #define CPUINFO_ARCH_ASMJS 1 +#endif + +#if defined(__wasm__) + #if 
defined(__wasm_simd128__) + #define CPUINFO_ARCH_WASMSIMD 1 + #else + #define CPUINFO_ARCH_WASM 1 + #endif +#endif + +/* Define other architecture-specific macros as 0 */ + +#ifndef CPUINFO_ARCH_X86 + #define CPUINFO_ARCH_X86 0 +#endif + +#ifndef CPUINFO_ARCH_X86_64 + #define CPUINFO_ARCH_X86_64 0 +#endif + +#ifndef CPUINFO_ARCH_ARM + #define CPUINFO_ARCH_ARM 0 +#endif + +#ifndef CPUINFO_ARCH_ARM64 + #define CPUINFO_ARCH_ARM64 0 +#endif + +#ifndef CPUINFO_ARCH_PPC64 + #define CPUINFO_ARCH_PPC64 0 +#endif + +#ifndef CPUINFO_ARCH_ASMJS + #define CPUINFO_ARCH_ASMJS 0 +#endif + +#ifndef CPUINFO_ARCH_WASM + #define CPUINFO_ARCH_WASM 0 +#endif + +#ifndef CPUINFO_ARCH_WASMSIMD + #define CPUINFO_ARCH_WASMSIMD 0 +#endif + +#if CPUINFO_ARCH_X86 && defined(_MSC_VER) + #define CPUINFO_ABI __cdecl +#elif CPUINFO_ARCH_X86 && defined(__GNUC__) + #define CPUINFO_ABI __attribute__((__cdecl__)) +#else + #define CPUINFO_ABI +#endif + +#define CPUINFO_CACHE_UNIFIED 0x00000001 +#define CPUINFO_CACHE_INCLUSIVE 0x00000002 +#define CPUINFO_CACHE_COMPLEX_INDEXING 0x00000004 + +struct cpuinfo_cache { + /** Cache size in bytes */ + uint32_t size; + /** Number of ways of associativity */ + uint32_t associativity; + /** Number of sets */ + uint32_t sets; + /** Number of partitions */ + uint32_t partitions; + /** Line size in bytes */ + uint32_t line_size; + /** + * Binary characteristics of the cache (unified cache, inclusive cache, cache with complex indexing). 
+ * + * @see CPUINFO_CACHE_UNIFIED, CPUINFO_CACHE_INCLUSIVE, CPUINFO_CACHE_COMPLEX_INDEXING + */ + uint32_t flags; + /** Index of the first logical processor that shares this cache */ + uint32_t processor_start; + /** Number of logical processors that share this cache */ + uint32_t processor_count; +}; + +struct cpuinfo_trace_cache { + uint32_t uops; + uint32_t associativity; +}; + +#define CPUINFO_PAGE_SIZE_4KB 0x1000 +#define CPUINFO_PAGE_SIZE_1MB 0x100000 +#define CPUINFO_PAGE_SIZE_2MB 0x200000 +#define CPUINFO_PAGE_SIZE_4MB 0x400000 +#define CPUINFO_PAGE_SIZE_16MB 0x1000000 +#define CPUINFO_PAGE_SIZE_1GB 0x40000000 + +struct cpuinfo_tlb { + uint32_t entries; + uint32_t associativity; + uint64_t pages; +}; + +/** Vendor of processor core design */ +enum cpuinfo_vendor { + /** Processor vendor is not known to the library, or the library failed to get vendor information from the OS. */ + cpuinfo_vendor_unknown = 0, + + /* Active vendors of modern CPUs */ + + /** + * Intel Corporation. Vendor of x86, x86-64, IA64, and ARM processor microarchitectures. + * + * Sold its ARM design subsidiary in 2006. The last ARM processor design was released in 2004. + */ + cpuinfo_vendor_intel = 1, + /** Advanced Micro Devices, Inc. Vendor of x86 and x86-64 processor microarchitectures. */ + cpuinfo_vendor_amd = 2, + /** ARM Holdings plc. Vendor of ARM and ARM64 processor microarchitectures. */ + cpuinfo_vendor_arm = 3, + /** Qualcomm Incorporated. Vendor of ARM and ARM64 processor microarchitectures. */ + cpuinfo_vendor_qualcomm = 4, + /** Apple Inc. Vendor of ARM and ARM64 processor microarchitectures. */ + cpuinfo_vendor_apple = 5, + /** Samsung Electronics Co., Ltd. Vendir if ARM64 processor microarchitectures. */ + cpuinfo_vendor_samsung = 6, + /** Nvidia Corporation. Vendor of ARM64-compatible processor microarchitectures. */ + cpuinfo_vendor_nvidia = 7, + /** MIPS Technologies, Inc. Vendor of MIPS processor microarchitectures. 
*/ + cpuinfo_vendor_mips = 8, + /** International Business Machines Corporation. Vendor of PowerPC processor microarchitectures. */ + cpuinfo_vendor_ibm = 9, + /** Ingenic Semiconductor. Vendor of MIPS processor microarchitectures. */ + cpuinfo_vendor_ingenic = 10, + /** + * VIA Technologies, Inc. Vendor of x86 and x86-64 processor microarchitectures. + * + * Processors are designed by Centaur Technology, a subsidiary of VIA Technologies. + */ + cpuinfo_vendor_via = 11, + /** Cavium, Inc. Vendor of ARM64 processor microarchitectures. */ + cpuinfo_vendor_cavium = 12, + /** Broadcom, Inc. Vendor of ARM processor microarchitectures. */ + cpuinfo_vendor_broadcom = 13, + /** Applied Micro Circuits Corporation (APM). Vendor of ARM64 processor microarchitectures. */ + cpuinfo_vendor_apm = 14, + /** + * Huawei Technologies Co., Ltd. Vendor of ARM64 processor microarchitectures. + * + * Processors are designed by HiSilicon, a subsidiary of Huawei. + */ + cpuinfo_vendor_huawei = 15, + /** + * Hygon (Chengdu Haiguang Integrated Circuit Design Co., Ltd), Vendor of x86-64 processor microarchitectures. + * + * Processors are variants of AMD cores. + */ + cpuinfo_vendor_hygon = 16, + + /* Active vendors of embedded CPUs */ + + /** Texas Instruments Inc. Vendor of ARM processor microarchitectures. */ + cpuinfo_vendor_texas_instruments = 30, + /** Marvell Technology Group Ltd. Vendor of ARM processor microarchitectures. */ + cpuinfo_vendor_marvell = 31, + /** RDC Semiconductor Co., Ltd. Vendor of x86 processor microarchitectures. */ + cpuinfo_vendor_rdc = 32, + /** DM&P Electronics Inc. Vendor of x86 processor microarchitectures. */ + cpuinfo_vendor_dmp = 33, + /** Motorola, Inc. Vendor of PowerPC and ARM processor microarchitectures. */ + cpuinfo_vendor_motorola = 34, + + /* Defunct CPU vendors */ + + /** + * Transmeta Corporation. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 2004. 
+ * Transmeta processors implemented VLIW ISA and used binary translation to execute x86 code. + */ + cpuinfo_vendor_transmeta = 50, + /** + * Cyrix Corporation. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 1996. + */ + cpuinfo_vendor_cyrix = 51, + /** + * Rise Technology. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 1999. + */ + cpuinfo_vendor_rise = 52, + /** + * National Semiconductor. Vendor of x86 processor microarchitectures. + * + * Sold its x86 design subsidiary in 1999. The last processor design was released in 1998. + */ + cpuinfo_vendor_nsc = 53, + /** + * Silicon Integrated Systems. Vendor of x86 processor microarchitectures. + * + * Sold its x86 design subsidiary in 2001. The last processor design was released in 2001. + */ + cpuinfo_vendor_sis = 54, + /** + * NexGen. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 1994. + * NexGen designed the first x86 microarchitecture which decomposed x86 instructions into simple microoperations. + */ + cpuinfo_vendor_nexgen = 55, + /** + * United Microelectronics Corporation. Vendor of x86 processor microarchitectures. + * + * Ceased x86 in the early 1990s. The last processor design was released in 1991. + * Designed U5C and U5D processors. Both are 486 level. + */ + cpuinfo_vendor_umc = 56, + /** + * Digital Equipment Corporation. Vendor of ARM processor microarchitecture. + * + * Sold its ARM designs in 1997. The last processor design was released in 1997. + */ + cpuinfo_vendor_dec = 57, +}; + +/** + * Processor microarchitecture + * + * Processors with different microarchitectures often have different instruction performance characteristics, + * and may have dramatically different pipeline organization. 
+ */ +enum cpuinfo_uarch { + /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ + cpuinfo_uarch_unknown = 0, + + /** Pentium and Pentium MMX microarchitecture. */ + cpuinfo_uarch_p5 = 0x00100100, + /** Intel Quark microarchitecture. */ + cpuinfo_uarch_quark = 0x00100101, + + /** Pentium Pro, Pentium II, and Pentium III. */ + cpuinfo_uarch_p6 = 0x00100200, + /** Pentium M. */ + cpuinfo_uarch_dothan = 0x00100201, + /** Intel Core microarchitecture. */ + cpuinfo_uarch_yonah = 0x00100202, + /** Intel Core 2 microarchitecture on 65 nm process. */ + cpuinfo_uarch_conroe = 0x00100203, + /** Intel Core 2 microarchitecture on 45 nm process. */ + cpuinfo_uarch_penryn = 0x00100204, + /** Intel Nehalem and Westmere microarchitectures (Core i3/i5/i7 1st gen). */ + cpuinfo_uarch_nehalem = 0x00100205, + /** Intel Sandy Bridge microarchitecture (Core i3/i5/i7 2nd gen). */ + cpuinfo_uarch_sandy_bridge = 0x00100206, + /** Intel Ivy Bridge microarchitecture (Core i3/i5/i7 3rd gen). */ + cpuinfo_uarch_ivy_bridge = 0x00100207, + /** Intel Haswell microarchitecture (Core i3/i5/i7 4th gen). */ + cpuinfo_uarch_haswell = 0x00100208, + /** Intel Broadwell microarchitecture. */ + cpuinfo_uarch_broadwell = 0x00100209, + /** Intel Sky Lake microarchitecture (14 nm, including Kaby/Coffee/Whiskey/Amber/Comet/Cascade/Cooper Lake). */ + cpuinfo_uarch_sky_lake = 0x0010020A, + /** DEPRECATED (Intel Kaby Lake microarchitecture). */ + cpuinfo_uarch_kaby_lake = 0x0010020A, + /** Intel Palm Cove microarchitecture (10 nm, Cannon Lake). */ + cpuinfo_uarch_palm_cove = 0x0010020B, + /** Intel Sunny Cove microarchitecture (10 nm, Ice Lake). */ + cpuinfo_uarch_sunny_cove = 0x0010020C, + + /** Pentium 4 with Willamette, Northwood, or Foster cores. */ + cpuinfo_uarch_willamette = 0x00100300, + /** Pentium 4 with Prescott and later cores. */ + cpuinfo_uarch_prescott = 0x00100301, + + /** Intel Atom on 45 nm process. 
*/ + cpuinfo_uarch_bonnell = 0x00100400, + /** Intel Atom on 32 nm process. */ + cpuinfo_uarch_saltwell = 0x00100401, + /** Intel Silvermont microarchitecture (22 nm out-of-order Atom). */ + cpuinfo_uarch_silvermont = 0x00100402, + /** Intel Airmont microarchitecture (14 nm out-of-order Atom). */ + cpuinfo_uarch_airmont = 0x00100403, + /** Intel Goldmont microarchitecture (Denverton, Apollo Lake). */ + cpuinfo_uarch_goldmont = 0x00100404, + /** Intel Goldmont Plus microarchitecture (Gemini Lake). */ + cpuinfo_uarch_goldmont_plus = 0x00100405, + + /** Intel Knights Ferry HPC boards. */ + cpuinfo_uarch_knights_ferry = 0x00100500, + /** Intel Knights Corner HPC boards (aka Xeon Phi). */ + cpuinfo_uarch_knights_corner = 0x00100501, + /** Intel Knights Landing microarchitecture (second-gen MIC). */ + cpuinfo_uarch_knights_landing = 0x00100502, + /** Intel Knights Hill microarchitecture (third-gen MIC). */ + cpuinfo_uarch_knights_hill = 0x00100503, + /** Intel Knights Mill Xeon Phi. */ + cpuinfo_uarch_knights_mill = 0x00100504, + + /** Intel/Marvell XScale series. */ + cpuinfo_uarch_xscale = 0x00100600, + + /** AMD K5. */ + cpuinfo_uarch_k5 = 0x00200100, + /** AMD K6 and alike. */ + cpuinfo_uarch_k6 = 0x00200101, + /** AMD Athlon and Duron. */ + cpuinfo_uarch_k7 = 0x00200102, + /** AMD Athlon 64, Opteron 64. */ + cpuinfo_uarch_k8 = 0x00200103, + /** AMD Family 10h (Barcelona, Istambul, Magny-Cours). */ + cpuinfo_uarch_k10 = 0x00200104, + /** + * AMD Bulldozer microarchitecture + * Zambezi FX-series CPUs, Zurich, Valencia and Interlagos Opteron CPUs. + */ + cpuinfo_uarch_bulldozer = 0x00200105, + /** + * AMD Piledriver microarchitecture + * Vishera FX-series CPUs, Trinity and Richland APUs, Delhi, Seoul, Abu Dhabi Opteron CPUs. + */ + cpuinfo_uarch_piledriver = 0x00200106, + /** AMD Steamroller microarchitecture (Kaveri APUs). */ + cpuinfo_uarch_steamroller = 0x00200107, + /** AMD Excavator microarchitecture (Carizzo APUs). 
*/ + cpuinfo_uarch_excavator = 0x00200108, + /** AMD Zen microarchitecture (12/14 nm Ryzen and EPYC CPUs). */ + cpuinfo_uarch_zen = 0x00200109, + /** AMD Zen 2 microarchitecture (7 nm Ryzen and EPYC CPUs). */ + cpuinfo_uarch_zen2 = 0x0020010A, + /** AMD Zen 3 microarchitecture. */ + cpuinfo_uarch_zen3 = 0x0020010B, + /** AMD Zen 4 microarchitecture. */ + cpuinfo_uarch_zen4 = 0x0020010C, + + /** NSC Geode and AMD Geode GX and LX. */ + cpuinfo_uarch_geode = 0x00200200, + /** AMD Bobcat mobile microarchitecture. */ + cpuinfo_uarch_bobcat = 0x00200201, + /** AMD Jaguar mobile microarchitecture. */ + cpuinfo_uarch_jaguar = 0x00200202, + /** AMD Puma mobile microarchitecture. */ + cpuinfo_uarch_puma = 0x00200203, + + /** ARM7 series. */ + cpuinfo_uarch_arm7 = 0x00300100, + /** ARM9 series. */ + cpuinfo_uarch_arm9 = 0x00300101, + /** ARM 1136, ARM 1156, ARM 1176, or ARM 11MPCore. */ + cpuinfo_uarch_arm11 = 0x00300102, + + /** ARM Cortex-A5. */ + cpuinfo_uarch_cortex_a5 = 0x00300205, + /** ARM Cortex-A7. */ + cpuinfo_uarch_cortex_a7 = 0x00300207, + /** ARM Cortex-A8. */ + cpuinfo_uarch_cortex_a8 = 0x00300208, + /** ARM Cortex-A9. */ + cpuinfo_uarch_cortex_a9 = 0x00300209, + /** ARM Cortex-A12. */ + cpuinfo_uarch_cortex_a12 = 0x00300212, + /** ARM Cortex-A15. */ + cpuinfo_uarch_cortex_a15 = 0x00300215, + /** ARM Cortex-A17. */ + cpuinfo_uarch_cortex_a17 = 0x00300217, + + /** ARM Cortex-A32. */ + cpuinfo_uarch_cortex_a32 = 0x00300332, + /** ARM Cortex-A35. */ + cpuinfo_uarch_cortex_a35 = 0x00300335, + /** ARM Cortex-A53. */ + cpuinfo_uarch_cortex_a53 = 0x00300353, + /** ARM Cortex-A55 revision 0 (restricted dual-issue capabilities compared to revision 1+). */ + cpuinfo_uarch_cortex_a55r0 = 0x00300354, + /** ARM Cortex-A55. */ + cpuinfo_uarch_cortex_a55 = 0x00300355, + /** ARM Cortex-A57. */ + cpuinfo_uarch_cortex_a57 = 0x00300357, + /** ARM Cortex-A65. */ + cpuinfo_uarch_cortex_a65 = 0x00300365, + /** ARM Cortex-A72. 
*/ + cpuinfo_uarch_cortex_a72 = 0x00300372, + /** ARM Cortex-A73. */ + cpuinfo_uarch_cortex_a73 = 0x00300373, + /** ARM Cortex-A75. */ + cpuinfo_uarch_cortex_a75 = 0x00300375, + /** ARM Cortex-A76. */ + cpuinfo_uarch_cortex_a76 = 0x00300376, + /** ARM Cortex-A77. */ + cpuinfo_uarch_cortex_a77 = 0x00300377, + /** ARM Cortex-A78. */ + cpuinfo_uarch_cortex_a78 = 0x00300378, + + /** ARM Neoverse N1. */ + cpuinfo_uarch_neoverse_n1 = 0x00300400, + /** ARM Neoverse E1. */ + cpuinfo_uarch_neoverse_e1 = 0x00300401, + /** ARM Neoverse V1. */ + cpuinfo_uarch_neoverse_v1 = 0x00300402, + /** ARM Neoverse N2. */ + cpuinfo_uarch_neoverse_n2 = 0x00300403, + /** ARM Neoverse V2. */ + cpuinfo_uarch_neoverse_v2 = 0x00300404, + + /** ARM Cortex-X1. */ + cpuinfo_uarch_cortex_x1 = 0x00300501, + /** ARM Cortex-X2. */ + cpuinfo_uarch_cortex_x2 = 0x00300502, + /** ARM Cortex-X3. */ + cpuinfo_uarch_cortex_x3 = 0x00300503, + + /** ARM Cortex-A510. */ + cpuinfo_uarch_cortex_a510 = 0x00300551, + /** ARM Cortex-A710. */ + cpuinfo_uarch_cortex_a710 = 0x00300571, + /** ARM Cortex-A715. */ + cpuinfo_uarch_cortex_a715 = 0x00300572, + + /** Qualcomm Scorpion. */ + cpuinfo_uarch_scorpion = 0x00400100, + /** Qualcomm Krait. */ + cpuinfo_uarch_krait = 0x00400101, + /** Qualcomm Kryo. */ + cpuinfo_uarch_kryo = 0x00400102, + /** Qualcomm Falkor. */ + cpuinfo_uarch_falkor = 0x00400103, + /** Qualcomm Saphira. */ + cpuinfo_uarch_saphira = 0x00400104, + + /** Nvidia Denver. */ + cpuinfo_uarch_denver = 0x00500100, + /** Nvidia Denver 2. */ + cpuinfo_uarch_denver2 = 0x00500101, + /** Nvidia Carmel. */ + cpuinfo_uarch_carmel = 0x00500102, + + /** Samsung Exynos M1 (Exynos 8890 big cores). */ + cpuinfo_uarch_exynos_m1 = 0x00600100, + /** Samsung Exynos M2 (Exynos 8895 big cores). */ + cpuinfo_uarch_exynos_m2 = 0x00600101, + /** Samsung Exynos M3 (Exynos 9810 big cores). */ + cpuinfo_uarch_exynos_m3 = 0x00600102, + /** Samsung Exynos M4 (Exynos 9820 big cores). 
*/ + cpuinfo_uarch_exynos_m4 = 0x00600103, + /** Samsung Exynos M5 (Exynos 9830 big cores). */ + cpuinfo_uarch_exynos_m5 = 0x00600104, + + /* Deprecated synonym for Cortex-A76 */ + cpuinfo_uarch_cortex_a76ae = 0x00300376, + /* Deprecated names for Exynos. */ + cpuinfo_uarch_mongoose_m1 = 0x00600100, + cpuinfo_uarch_mongoose_m2 = 0x00600101, + cpuinfo_uarch_meerkat_m3 = 0x00600102, + cpuinfo_uarch_meerkat_m4 = 0x00600103, + + /** Apple A6 and A6X processors. */ + cpuinfo_uarch_swift = 0x00700100, + /** Apple A7 processor. */ + cpuinfo_uarch_cyclone = 0x00700101, + /** Apple A8 and A8X processor. */ + cpuinfo_uarch_typhoon = 0x00700102, + /** Apple A9 and A9X processor. */ + cpuinfo_uarch_twister = 0x00700103, + /** Apple A10 and A10X processor. */ + cpuinfo_uarch_hurricane = 0x00700104, + /** Apple A11 processor (big cores). */ + cpuinfo_uarch_monsoon = 0x00700105, + /** Apple A11 processor (little cores). */ + cpuinfo_uarch_mistral = 0x00700106, + /** Apple A12 processor (big cores). */ + cpuinfo_uarch_vortex = 0x00700107, + /** Apple A12 processor (little cores). */ + cpuinfo_uarch_tempest = 0x00700108, + /** Apple A13 processor (big cores). */ + cpuinfo_uarch_lightning = 0x00700109, + /** Apple A13 processor (little cores). */ + cpuinfo_uarch_thunder = 0x0070010A, + /** Apple A14 / M1 processor (big cores). */ + cpuinfo_uarch_firestorm = 0x0070010B, + /** Apple A14 / M1 processor (little cores). */ + cpuinfo_uarch_icestorm = 0x0070010C, + /** Apple A15 / M2 processor (big cores). */ + cpuinfo_uarch_avalanche = 0x0070010D, + /** Apple A15 / M2 processor (little cores). */ + cpuinfo_uarch_blizzard = 0x0070010E, + + /** Cavium ThunderX. */ + cpuinfo_uarch_thunderx = 0x00800100, + /** Cavium ThunderX2 (originally Broadcom Vulkan). */ + cpuinfo_uarch_thunderx2 = 0x00800200, + + /** Marvell PJ4. */ + cpuinfo_uarch_pj4 = 0x00900100, + + /** Broadcom Brahma B15. */ + cpuinfo_uarch_brahma_b15 = 0x00A00100, + /** Broadcom Brahma B53. 
*/ + cpuinfo_uarch_brahma_b53 = 0x00A00101, + + /** Applied Micro X-Gene. */ + cpuinfo_uarch_xgene = 0x00B00100, + + /* Hygon Dhyana (a modification of AMD Zen for Chinese market). */ + cpuinfo_uarch_dhyana = 0x01000100, + + /** HiSilicon TaiShan v110 (Huawei Kunpeng 920 series processors). */ + cpuinfo_uarch_taishan_v110 = 0x00C00100, +}; + +struct cpuinfo_processor { + /** SMT (hyperthread) ID within a core */ + uint32_t smt_id; + /** Core containing this logical processor */ + const struct cpuinfo_core* core; + /** Cluster of cores containing this logical processor */ + const struct cpuinfo_cluster* cluster; + /** Physical package containing this logical processor */ + const struct cpuinfo_package* package; +#if defined(__linux__) + /** + * Linux-specific ID for the logical processor: + * - Linux kernel exposes information about this logical processor in /sys/devices/system/cpu/cpu/ + * - Bit in the cpu_set_t identifies this logical processor + */ + int linux_id; +#endif +#if defined(_WIN32) || defined(__CYGWIN__) + /** Windows-specific ID for the group containing the logical processor. */ + uint16_t windows_group_id; + /** + * Windows-specific ID of the logical processor within its group: + * - Bit in the KAFFINITY mask identifies this logical processor within its group. + */ + uint16_t windows_processor_id; +#endif +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** APIC ID (unique x86-specific ID of the logical processor) */ + uint32_t apic_id; +#endif + struct { + /** Level 1 instruction cache */ + const struct cpuinfo_cache* l1i; + /** Level 1 data cache */ + const struct cpuinfo_cache* l1d; + /** Level 2 unified or data cache */ + const struct cpuinfo_cache* l2; + /** Level 3 unified or data cache */ + const struct cpuinfo_cache* l3; + /** Level 4 unified or data cache */ + const struct cpuinfo_cache* l4; + } cache; +}; + +struct cpuinfo_core { + /** Index of the first logical processor on this core. 
*/ + uint32_t processor_start; + /** Number of logical processors on this core */ + uint32_t processor_count; + /** Core ID within a package */ + uint32_t core_id; + /** Cluster containing this core */ + const struct cpuinfo_cluster* cluster; + /** Physical package containing this core. */ + const struct cpuinfo_package* package; + /** Vendor of the CPU microarchitecture for this core */ + enum cpuinfo_vendor vendor; + /** CPU microarchitecture for this core */ + enum cpuinfo_uarch uarch; +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** Value of CPUID leaf 1 EAX register for this core */ + uint32_t cpuid; +#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /** Value of Main ID Register (MIDR) for this core */ + uint32_t midr; +#endif + /** Clock rate (non-Turbo) of the core, in Hz */ + uint64_t frequency; +}; + +struct cpuinfo_cluster { + /** Index of the first logical processor in the cluster */ + uint32_t processor_start; + /** Number of logical processors in the cluster */ + uint32_t processor_count; + /** Index of the first core in the cluster */ + uint32_t core_start; + /** Number of cores on the cluster */ + uint32_t core_count; + /** Cluster ID within a package */ + uint32_t cluster_id; + /** Physical package containing the cluster */ + const struct cpuinfo_package* package; + /** CPU microarchitecture vendor of the cores in the cluster */ + enum cpuinfo_vendor vendor; + /** CPU microarchitecture of the cores in the cluster */ + enum cpuinfo_uarch uarch; +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** Value of CPUID leaf 1 EAX register of the cores in the cluster */ + uint32_t cpuid; +#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /** Value of Main ID Register (MIDR) of the cores in the cluster */ + uint32_t midr; +#endif + /** Clock rate (non-Turbo) of the cores in the cluster, in Hz */ + uint64_t frequency; +}; + +#define CPUINFO_PACKAGE_NAME_MAX 48 + +struct cpuinfo_package { + /** SoC or processor chip model name */ + char 
name[CPUINFO_PACKAGE_NAME_MAX]; + /** Index of the first logical processor on this physical package */ + uint32_t processor_start; + /** Number of logical processors on this physical package */ + uint32_t processor_count; + /** Index of the first core on this physical package */ + uint32_t core_start; + /** Number of cores on this physical package */ + uint32_t core_count; + /** Index of the first cluster of cores on this physical package */ + uint32_t cluster_start; + /** Number of clusters of cores on this physical package */ + uint32_t cluster_count; +}; + +struct cpuinfo_uarch_info { + /** Type of CPU microarchitecture */ + enum cpuinfo_uarch uarch; +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** Value of CPUID leaf 1 EAX register for the microarchitecture */ + uint32_t cpuid; +#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /** Value of Main ID Register (MIDR) for the microarchitecture */ + uint32_t midr; +#endif + /** Number of logical processors with the microarchitecture */ + uint32_t processor_count; + /** Number of cores with the microarchitecture */ + uint32_t core_count; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +bool CPUINFO_ABI cpuinfo_initialize(void); + +void CPUINFO_ABI cpuinfo_deinitialize(void); + +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /* This structure is not a part of stable API. Use cpuinfo_has_x86_* functions instead. 
*/ + struct cpuinfo_x86_isa { + #if CPUINFO_ARCH_X86 + bool rdtsc; + #endif + bool rdtscp; + bool rdpid; + bool sysenter; + #if CPUINFO_ARCH_X86 + bool syscall; + #endif + bool msr; + bool clzero; + bool clflush; + bool clflushopt; + bool mwait; + bool mwaitx; + #if CPUINFO_ARCH_X86 + bool emmx; + #endif + bool fxsave; + bool xsave; + #if CPUINFO_ARCH_X86 + bool fpu; + bool mmx; + bool mmx_plus; + #endif + bool three_d_now; + bool three_d_now_plus; + #if CPUINFO_ARCH_X86 + bool three_d_now_geode; + #endif + bool prefetch; + bool prefetchw; + bool prefetchwt1; + #if CPUINFO_ARCH_X86 + bool daz; + bool sse; + bool sse2; + #endif + bool sse3; + bool ssse3; + bool sse4_1; + bool sse4_2; + bool sse4a; + bool misaligned_sse; + bool avx; + bool avxvnni; + bool fma3; + bool fma4; + bool xop; + bool f16c; + bool avx2; + bool avx512f; + bool avx512pf; + bool avx512er; + bool avx512cd; + bool avx512dq; + bool avx512bw; + bool avx512vl; + bool avx512ifma; + bool avx512vbmi; + bool avx512vbmi2; + bool avx512bitalg; + bool avx512vpopcntdq; + bool avx512vnni; + bool avx512bf16; + bool avx512fp16; + bool avx512vp2intersect; + bool avx512_4vnniw; + bool avx512_4fmaps; + bool hle; + bool rtm; + bool xtest; + bool mpx; + #if CPUINFO_ARCH_X86 + bool cmov; + bool cmpxchg8b; + #endif + bool cmpxchg16b; + bool clwb; + bool movbe; + #if CPUINFO_ARCH_X86_64 + bool lahf_sahf; + #endif + bool fs_gs_base; + bool lzcnt; + bool popcnt; + bool tbm; + bool bmi; + bool bmi2; + bool adx; + bool aes; + bool vaes; + bool pclmulqdq; + bool vpclmulqdq; + bool gfni; + bool rdrand; + bool rdseed; + bool sha; + bool rng; + bool ace; + bool ace2; + bool phe; + bool pmm; + bool lwp; + }; + + extern struct cpuinfo_x86_isa cpuinfo_isa; +#endif + +static inline bool cpuinfo_has_x86_rdtsc(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.rdtsc; + #endif + #else + return false; + #endif +} + +static inline bool 
cpuinfo_has_x86_rdtscp(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdtscp; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rdpid(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdpid; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_clzero(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.clzero; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mwait(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.mwait; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mwaitx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.mwaitx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fxsave(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.fxsave; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_xsave(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.xsave; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fpu(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.fpu; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mmx(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.mmx; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mmx_plus(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.mmx_plus; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_3dnow(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.three_d_now; + #else + return false; + 
#endif +} + +static inline bool cpuinfo_has_x86_3dnow_plus(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.three_d_now_plus; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_3dnow_geode(void) { + #if CPUINFO_ARCH_X86_64 + return false; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return false; + #else + return cpuinfo_isa.three_d_now_geode; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_prefetch(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.prefetch; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_prefetchw(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.prefetchw; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_prefetchwt1(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.prefetchwt1; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_daz(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.daz; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse2(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse2; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse3(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse3; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_ssse3(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + #if 
defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.ssse3; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse4_1(void) { + #if CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse4_1; + #endif + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.sse4_1; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse4_2(void) { + #if CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse4_2; + #endif + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.sse4_2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse4a(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.sse4a; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_misaligned_sse(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.misaligned_sse; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avxvnni(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avxvnni; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fma3(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.fma3; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fma4(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.fma4; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_xop(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.xop; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_f16c(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.f16c; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx2(void) { + #if 
CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512f(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512f; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512pf(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512pf; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512er(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512er; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512cd(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512cd; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512dq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512dq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512bw(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512bw; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vl(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vl; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512ifma(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512ifma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vbmi(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vbmi; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vbmi2(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vbmi2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512bitalg(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512bitalg; + #else + return false; + #endif +} + +static inline bool 
cpuinfo_has_x86_avx512vpopcntdq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vpopcntdq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vnni(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vnni; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512bf16(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512fp16(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512fp16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vp2intersect(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vp2intersect; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512_4vnniw(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512_4vnniw; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512_4fmaps(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512_4fmaps; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_hle(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.hle; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rtm(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rtm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_xtest(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.xtest; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mpx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.mpx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_cmov(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.cmov; 
+ #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_cmpxchg8b(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.cmpxchg8b; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_cmpxchg16b(void) { + #if CPUINFO_ARCH_X86_64 + return cpuinfo_isa.cmpxchg16b; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_clwb(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.clwb; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_movbe(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.movbe; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_lahf_sahf(void) { + #if CPUINFO_ARCH_X86 + return true; + #elif CPUINFO_ARCH_X86_64 + return cpuinfo_isa.lahf_sahf; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_lzcnt(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.lzcnt; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_popcnt(void) { + #if CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.popcnt; + #endif + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.popcnt; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_tbm(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.tbm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_bmi(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.bmi; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_bmi2(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.bmi2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_adx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.adx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_aes(void) { + #if 
CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.aes; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_vaes(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.vaes; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_pclmulqdq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.pclmulqdq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_vpclmulqdq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.vpclmulqdq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_gfni(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.gfni; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rdrand(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdrand; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rdseed(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdseed; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sha(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.sha; + #else + return false; + #endif +} + +#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /* This structure is not a part of stable API. Use cpuinfo_has_arm_* functions instead. 
*/ + struct cpuinfo_arm_isa { + #if CPUINFO_ARCH_ARM + bool thumb; + bool thumb2; + bool thumbee; + bool jazelle; + bool armv5e; + bool armv6; + bool armv6k; + bool armv7; + bool armv7mp; + bool armv8; + bool idiv; + + bool vfpv2; + bool vfpv3; + bool d32; + bool fp16; + bool fma; + + bool wmmx; + bool wmmx2; + bool neon; + #endif + #if CPUINFO_ARCH_ARM64 + bool atomics; + bool bf16; + bool sve; + bool sve2; + bool i8mm; + #endif + bool rdm; + bool fp16arith; + bool dot; + bool jscvt; + bool fcma; + bool fhm; + + bool aes; + bool sha1; + bool sha2; + bool pmull; + bool crc32; + }; + + extern struct cpuinfo_arm_isa cpuinfo_isa; +#endif + +static inline bool cpuinfo_has_arm_thumb(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.thumb; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_thumb2(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.thumb2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v5e(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv5e; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v6(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv6; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v6k(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv6k; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v7(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv7; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v7mp(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv7mp; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v8(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.armv8; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_idiv(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.idiv; + #else + return false; + #endif +} + +static inline bool 
cpuinfo_has_arm_vfpv2(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3_d32(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.d32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3_fp16(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3_fp16_d32(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16 && cpuinfo_isa.d32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv4(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv4_d32(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma && cpuinfo_isa.d32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_fp16_arith(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fp16arith; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_bf16(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_wmmx(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.wmmx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_wmmx2(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.wmmx2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon(void) { + 
#if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_fp16(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.fp16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_fma(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.fma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_v8(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.armv8; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_atomics(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.atomics; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_rdm(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.rdm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_fp16_arith(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.fp16arith; + #elif CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fp16arith; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_fhm(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fhm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_dot(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.dot; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_bf16(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_jscvt(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.jscvt; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_fcma(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + 
return cpuinfo_isa.fcma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_i8mm(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.i8mm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_aes(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.aes; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sha1(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sha1; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sha2(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sha2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_pmull(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.pmull; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_crc32(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.crc32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sve(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sve; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sve_bf16(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sve && cpuinfo_isa.bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sve2(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sve2; + #else + return false; + #endif +} + +const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processors(void); +const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_cores(void); +const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_clusters(void); +const struct cpuinfo_package* CPUINFO_ABI cpuinfo_get_packages(void); +const struct cpuinfo_uarch_info* CPUINFO_ABI cpuinfo_get_uarchs(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI 
cpuinfo_get_l2_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_caches(void); + +const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processor(uint32_t index); +const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_core(uint32_t index); +const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_cluster(uint32_t index); +const struct cpuinfo_package* CPUINFO_ABI cpuinfo_get_package(uint32_t index); +const struct cpuinfo_uarch_info* CPUINFO_ABI cpuinfo_get_uarch(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l2_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_cache(uint32_t index); + +uint32_t CPUINFO_ABI cpuinfo_get_processors_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_cores_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_clusters_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_packages_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_uarchs_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l1i_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l1d_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l2_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l3_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l4_caches_count(void); + +/** + * Returns upper bound on cache size. + */ +uint32_t CPUINFO_ABI cpuinfo_get_max_cache_size(void); + +/** + * Identify the logical processor that executes the current thread. + * + * There is no guarantee that the thread will stay on the same logical processor for any time. + * Callers should treat the result as only a hint, and be prepared to handle NULL return value. 
+ */ +const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_current_processor(void); + +/** + * Identify the core that executes the current thread. + * + * There is no guarantee that the thread will stay on the same core for any time. + * Callers should treat the result as only a hint, and be prepared to handle NULL return value. + */ +const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_current_core(void); + +/** + * Identify the microarchitecture index of the core that executes the current thread. + * If the system does not support such identification, the function returns 0. + * + * There is no guarantee that the thread will stay on the same type of core for any time. + * Callers should treat the result as only a hint. + */ +uint32_t CPUINFO_ABI cpuinfo_get_current_uarch_index(void); + +/** + * Identify the microarchitecture index of the core that executes the current thread. + * If the system does not support such identification, the function returns the user-specified default value. + * + * There is no guarantee that the thread will stay on the same type of core for any time. + * Callers should treat the result as only a hint. 
+ */ +uint32_t CPUINFO_ABI cpuinfo_get_current_uarch_index_with_default(uint32_t default_uarch_index); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CPUINFO_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h new file mode 100644 index 0000000000000000000000000000000000000000..ad731150b28babe7bd5a911acd8de70c57e85254 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef DNNL_OCL_H +#define DNNL_OCL_H + +#include "oneapi/dnnl/dnnl_ocl.h" + +#endif /* DNNL_OCL_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..32a3d5cf839b1d593f069520febfd60b323730e9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_VERSION_H +#define DNNL_VERSION_H + +#include "oneapi/dnnl/dnnl_version.h" + +#endif /* DNNL_VERSION_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/sleef.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/sleef.h new file mode 100644 index 0000000000000000000000000000000000000000..de36514f991a5f9b4774b232a1a6350c47c2c74c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/sleef.h @@ -0,0 +1,4459 @@ +// Copyright Naoki Shibata and contributors 2010 - 2020. 
+// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef __SLEEF_H__ +#define __SLEEF_H__ + +#define SLEEF_VERSION_MAJOR 3 +#define SLEEF_VERSION_MINOR 6 +#define SLEEF_VERSION_PATCHLEVEL 0 + +#include +#include + +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define CONST const +#else +#define CONST +#endif + +#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) +#ifndef FP_FAST_FMA +#define FP_FAST_FMA +#endif +#ifndef FP_FAST_FMAF +#define FP_FAST_FMAF +#endif +#endif + +#if defined(_MSC_VER) && !defined(__STDC__) +#define __STDC__ 1 +#endif + +#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#ifdef IMPORT_IS_EXPORT +#define IMPORT __declspec(dllexport) +#else // #ifdef IMPORT_IS_EXPORT +#define IMPORT __declspec(dllimport) +#if (defined(_MSC_VER)) +#pragma comment(lib,"sleef.lib") +#endif // #if (defined(_MSC_VER)) +#endif // #ifdef IMPORT_IS_EXPORT +#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#define IMPORT +#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) + +#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif + +#if (defined(_MSC_VER)) +#include +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#endif + +#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#include +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector 
long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +#if defined(__VX__) && defined(__VEC__) +#ifndef SLEEF_VECINTRIN_H_INCLUDED +#include +#define SLEEF_VECINTRIN_H_INCLUDED +#endif +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +// + +#ifndef SLEEF_FP_ILOGB0 +#define SLEEF_FP_ILOGB0 ((int)-2147483648) +#endif + +#ifndef SLEEF_FP_ILOGBNAN +#define SLEEF_FP_ILOGBNAN ((int)2147483647) +#endif + +// + +IMPORT void *Sleef_malloc(size_t z); +IMPORT void Sleef_free(void *ptr); +IMPORT uint64_t Sleef_currentTimeMicros(); + +#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +IMPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx); +#endif + +// + +#ifndef Sleef_double2_DEFINED +#define Sleef_double2_DEFINED +typedef struct { + double x, y; +} Sleef_double2; +#endif + +#ifndef Sleef_float2_DEFINED +#define Sleef_float2_DEFINED +typedef struct { + float x, y; +} Sleef_float2; +#endif + +#ifndef Sleef_longdouble2_DEFINED +#define Sleef_longdouble2_DEFINED +typedef struct { + long double x, y; +} Sleef_longdouble2; +#endif + +#if !defined(Sleef_quad_DEFINED) +#define Sleef_quad_DEFINED +#if defined(__SIZEOF_FLOAT128__) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8) +typedef __float128 Sleef_quad; +#define SLEEF_QUAD_C(x) (x ## Q) +//#elif defined(__SIZEOF_LONG_DOUBLE__) && defined(__aarch64__) +//typedef long double Sleef_quad; +//#define SLEEF_QUAD_C(x) (x ## L) +#else +typedef struct { uint64_t x, y; } Sleef_quad; +#endif +#endif + +#if !defined(Sleef_quad2_DEFINED) +#define Sleef_quad2_DEFINED +typedef union { + 
struct { + Sleef_quad x, y; + }; + Sleef_quad s[2]; +} Sleef_quad2; +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +IMPORT CONST double Sleef_sin_u35(double); +IMPORT CONST double Sleef_cos_u35(double); +IMPORT CONST Sleef_double2 Sleef_sincos_u35(double); +IMPORT CONST double Sleef_tan_u35(double); +IMPORT CONST double Sleef_asin_u35(double); +IMPORT CONST double Sleef_acos_u35(double); +IMPORT CONST double Sleef_atan_u35(double); +IMPORT CONST double Sleef_atan2_u35(double, double); +IMPORT CONST double Sleef_log_u35(double); +IMPORT CONST double Sleef_cbrt_u35(double); +IMPORT CONST double Sleef_sin_u10(double); +IMPORT CONST double Sleef_cos_u10(double); +IMPORT CONST Sleef_double2 Sleef_sincos_u10(double); +IMPORT CONST double Sleef_tan_u10(double); +IMPORT CONST double Sleef_asin_u10(double); +IMPORT CONST double Sleef_acos_u10(double); +IMPORT CONST double Sleef_atan_u10(double); +IMPORT CONST double Sleef_atan2_u10(double, double); +IMPORT CONST double Sleef_log_u10(double); +IMPORT CONST double Sleef_cbrt_u10(double); +IMPORT CONST double Sleef_exp_u10(double); +IMPORT CONST double Sleef_pow_u10(double, double); +IMPORT CONST double Sleef_sinh_u10(double); +IMPORT CONST double Sleef_cosh_u10(double); +IMPORT CONST double Sleef_tanh_u10(double); +IMPORT CONST double Sleef_sinh_u35(double); +IMPORT CONST double Sleef_cosh_u35(double); +IMPORT CONST double Sleef_tanh_u35(double); +IMPORT CONST double Sleef_asinh_u10(double); +IMPORT CONST double Sleef_acosh_u10(double); +IMPORT CONST double Sleef_atanh_u10(double); +IMPORT CONST double Sleef_exp2_u10(double); +IMPORT CONST double Sleef_exp10_u10(double); +IMPORT CONST double Sleef_exp2_u35(double); +IMPORT CONST double Sleef_exp10_u35(double); +IMPORT CONST double Sleef_expm1_u10(double); +IMPORT CONST double Sleef_log10_u10(double); +IMPORT CONST double Sleef_log2_u10(double); +IMPORT CONST double Sleef_log2_u35(double); +IMPORT CONST double Sleef_log1p_u10(double); +IMPORT CONST Sleef_double2 
Sleef_sincospi_u05(double); +IMPORT CONST Sleef_double2 Sleef_sincospi_u35(double); +IMPORT CONST double Sleef_sinpi_u05(double); +IMPORT CONST double Sleef_cospi_u05(double); +IMPORT CONST double Sleef_ldexp(double, int); +IMPORT CONST int Sleef_ilogb(double); +IMPORT CONST double Sleef_fma(double, double, double); +IMPORT CONST double Sleef_sqrt(double); +IMPORT CONST double Sleef_sqrt_u05(double); +IMPORT CONST double Sleef_sqrt_u35(double); + +IMPORT CONST double Sleef_hypot_u05(double, double); +IMPORT CONST double Sleef_hypot_u35(double, double); + +IMPORT CONST double Sleef_fabs(double); +IMPORT CONST double Sleef_copysign(double, double); +IMPORT CONST double Sleef_fmax(double, double); +IMPORT CONST double Sleef_fmin(double, double); +IMPORT CONST double Sleef_fdim(double, double); +IMPORT CONST double Sleef_trunc(double); +IMPORT CONST double Sleef_floor(double); +IMPORT CONST double Sleef_ceil(double); +IMPORT CONST double Sleef_round(double); +IMPORT CONST double Sleef_rint(double); +IMPORT CONST double Sleef_nextafter(double, double); +IMPORT CONST double Sleef_frfrexp(double); +IMPORT CONST int Sleef_expfrexp(double); +IMPORT CONST double Sleef_fmod(double, double); +IMPORT CONST double Sleef_remainder(double, double); +IMPORT CONST Sleef_double2 Sleef_modf(double); + +IMPORT CONST double Sleef_lgamma_u10(double); +IMPORT CONST double Sleef_tgamma_u10(double); +IMPORT CONST double Sleef_erf_u10(double); +IMPORT CONST double Sleef_erfc_u15(double); + +IMPORT CONST float Sleef_sinf_u35(float); +IMPORT CONST float Sleef_cosf_u35(float); +IMPORT CONST Sleef_float2 Sleef_sincosf_u35(float); +IMPORT CONST float Sleef_tanf_u35(float); +IMPORT CONST float Sleef_asinf_u35(float); +IMPORT CONST float Sleef_acosf_u35(float); +IMPORT CONST float Sleef_atanf_u35(float); +IMPORT CONST float Sleef_atan2f_u35(float, float); +IMPORT CONST float Sleef_logf_u35(float); +IMPORT CONST float Sleef_cbrtf_u35(float); +IMPORT CONST float Sleef_sinf_u10(float); +IMPORT CONST 
float Sleef_cosf_u10(float); +IMPORT CONST Sleef_float2 Sleef_sincosf_u10(float); +IMPORT CONST float Sleef_fastsinf_u3500(float); +IMPORT CONST float Sleef_fastcosf_u3500(float); +IMPORT CONST float Sleef_tanf_u10(float); +IMPORT CONST float Sleef_asinf_u10(float); +IMPORT CONST float Sleef_acosf_u10(float); +IMPORT CONST float Sleef_atanf_u10(float); +IMPORT CONST float Sleef_atan2f_u10(float, float); +IMPORT CONST float Sleef_logf_u10(float); +IMPORT CONST float Sleef_cbrtf_u10(float); +IMPORT CONST float Sleef_expf_u10(float); +IMPORT CONST float Sleef_powf_u10(float, float); +IMPORT CONST float Sleef_fastpowf_u3500(float, float); +IMPORT CONST float Sleef_sinhf_u10(float); +IMPORT CONST float Sleef_coshf_u10(float); +IMPORT CONST float Sleef_tanhf_u10(float); +IMPORT CONST float Sleef_sinhf_u35(float); +IMPORT CONST float Sleef_coshf_u35(float); +IMPORT CONST float Sleef_tanhf_u35(float); +IMPORT CONST float Sleef_asinhf_u10(float); +IMPORT CONST float Sleef_acoshf_u10(float); +IMPORT CONST float Sleef_atanhf_u10(float); +IMPORT CONST float Sleef_exp2f_u10(float); +IMPORT CONST float Sleef_exp10f_u10(float); +IMPORT CONST float Sleef_exp2f_u35(float); +IMPORT CONST float Sleef_exp10f_u35(float); +IMPORT CONST float Sleef_expm1f_u10(float); +IMPORT CONST float Sleef_log10f_u10(float); +IMPORT CONST float Sleef_log2f_u10(float); +IMPORT CONST float Sleef_log2f_u35(float); +IMPORT CONST float Sleef_log1pf_u10(float); +IMPORT CONST Sleef_float2 Sleef_sincospif_u05(float); +IMPORT CONST Sleef_float2 Sleef_sincospif_u35(float); +IMPORT CONST float Sleef_sinpif_u05(float d); +IMPORT CONST float Sleef_cospif_u05(float d); +IMPORT CONST float Sleef_ldexpf(float, int); +IMPORT CONST int Sleef_ilogbf(float); +IMPORT CONST float Sleef_fmaf(float, float, float); +IMPORT CONST float Sleef_sqrtf(float); +IMPORT CONST float Sleef_sqrtf_u05(float); +IMPORT CONST float Sleef_sqrtf_u35(float); + +IMPORT CONST float Sleef_hypotf_u05(float, float); +IMPORT CONST float 
Sleef_hypotf_u35(float, float); + +IMPORT CONST float Sleef_fabsf(float); +IMPORT CONST float Sleef_copysignf(float, float); +IMPORT CONST float Sleef_fmaxf(float, float); +IMPORT CONST float Sleef_fminf(float, float); +IMPORT CONST float Sleef_fdimf(float, float); +IMPORT CONST float Sleef_truncf(float); +IMPORT CONST float Sleef_floorf(float); +IMPORT CONST float Sleef_ceilf(float); +IMPORT CONST float Sleef_roundf(float); +IMPORT CONST float Sleef_rintf(float); +IMPORT CONST float Sleef_nextafterf(float, float); +IMPORT CONST float Sleef_frfrexpf(float); +IMPORT CONST int Sleef_expfrexpf(float); +IMPORT CONST float Sleef_fmodf(float, float); +IMPORT CONST float Sleef_remainderf(float, float); +IMPORT CONST Sleef_float2 Sleef_modff(float); + +IMPORT CONST float Sleef_lgammaf_u10(float); +IMPORT CONST float Sleef_tgammaf_u10(float); +IMPORT CONST float Sleef_erff_u10(float); +IMPORT CONST float Sleef_erfcf_u15(float); + +IMPORT CONST Sleef_longdouble2 Sleef_sincospil_u05(long double); +IMPORT CONST Sleef_longdouble2 Sleef_sincospil_u35(long double); + +#if defined(Sleef_quad2_DEFINED) +IMPORT CONST Sleef_quad2 Sleef_sincospiq_u05(Sleef_quad); +IMPORT CONST Sleef_quad2 Sleef_sincospiq_u35(Sleef_quad); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +IMPORT CONST __m128d Sleef_sind2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u35(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u35(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u35(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u35(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35(__m128d); +IMPORT CONST __m128d 
Sleef_cinz_acosd2_u35(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u35(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_sind2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u10(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u10(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u10(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u10(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u10(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u10(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_expd2_u10(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_powd2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_sinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10(__m128d); 
+IMPORT CONST __m128d Sleef_cinz_tanhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u35(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastsind2_u3500(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastcosd2_u3500(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fastpowd2_u3500(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_asinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_acoshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_atanhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u35(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_expm1d2_u10(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_log10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_log1pd2_u10(__m128d); +IMPORT CONST 
Sleef___m128d_2 Sleef_sincospid2_u05(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinpid2_u05(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05(__m128d); +IMPORT CONST __m128d Sleef_cinz_cospid2_u05(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2(__m128d, __m128i); +IMPORT CONST __m128d Sleef_cinz_ldexpd2(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2(__m128d); +IMPORT CONST __m128i Sleef_cinz_ilogbd2(__m128d); +IMPORT CONST __m128d Sleef_fmad2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmad2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u05(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u05(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fabsd2(__m128d); +IMPORT CONST __m128d Sleef_copysignd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_copysignd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmaxd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmind2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fdimd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_truncd2(__m128d); +IMPORT 
CONST __m128d Sleef_floord2(__m128d); +IMPORT CONST __m128d Sleef_cinz_floord2(__m128d); +IMPORT CONST __m128d Sleef_ceild2(__m128d); +IMPORT CONST __m128d Sleef_cinz_ceild2(__m128d); +IMPORT CONST __m128d Sleef_roundd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_roundd2(__m128d); +IMPORT CONST __m128d Sleef_rintd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_rintd2(__m128d); +IMPORT CONST __m128d Sleef_nextafterd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_nextafterd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_frfrexpd2(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2(__m128d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd2(__m128d); +IMPORT CONST __m128d Sleef_fmodd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmodd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_remainderd2(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_modfd2(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_lgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_tgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_erfd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfd2_u10(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfcd2_u15(__m128d); +IMPORT CONST int Sleef_getIntd2(int); +IMPORT CONST void *Sleef_getPtrd2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u35(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u35(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u35(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35(__m128); +IMPORT 
CONST __m128 Sleef_tanf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u35(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u35(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u35(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u35(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u35(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u10(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u10(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u10(__m128); +IMPORT CONST __m128 Sleef_asinf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u10(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u10(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u10(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u10(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u10(__m128); +IMPORT CONST __m128 Sleef_expf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_expf4_u10(__m128); +IMPORT CONST __m128 Sleef_powf4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_powf4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10(__m128); +IMPORT CONST __m128 
Sleef_cinz_sinhf4_u10(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u10(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u35(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u35(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u35(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500(__m128); +IMPORT CONST __m128 Sleef_cinz_fastsinf4_u3500(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500(__m128); +IMPORT CONST __m128 Sleef_cinz_fastcosf4_u3500(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fastpowf4_u3500(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_asinhf4_u10(__m128); +IMPORT CONST __m128 Sleef_acoshf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_acoshf4_u10(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_atanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u10(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u35(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u10(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u35(__m128); +IMPORT CONST __m128 Sleef_expm1f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_expm1f4_u10(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_log10f4_u10(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u10(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u35(__m128); +IMPORT CONST __m128 
Sleef_log1pf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_log1pf4_u10(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05(__m128); +IMPORT CONST __m128 Sleef_cinz_sinpif4_u05(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05(__m128); +IMPORT CONST __m128 Sleef_cinz_cospif4_u05(__m128); +IMPORT CONST __m128 Sleef_fmaf4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaf4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u05(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u05(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4(__m128); +IMPORT CONST __m128 Sleef_cinz_fabsf4(__m128); +IMPORT CONST __m128 Sleef_copysignf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_copysignf4(__m128, __m128); +IMPORT CONST __m128 Sleef_fmaxf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaxf4(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fminf4(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fdimf4(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4(__m128); +IMPORT CONST __m128 Sleef_cinz_truncf4(__m128); +IMPORT CONST __m128 Sleef_floorf4(__m128); +IMPORT CONST __m128 Sleef_cinz_floorf4(__m128); +IMPORT CONST __m128 Sleef_ceilf4(__m128); +IMPORT CONST __m128 Sleef_cinz_ceilf4(__m128); +IMPORT CONST __m128 
Sleef_roundf4(__m128); +IMPORT CONST __m128 Sleef_cinz_roundf4(__m128); +IMPORT CONST __m128 Sleef_rintf4(__m128); +IMPORT CONST __m128 Sleef_cinz_rintf4(__m128); +IMPORT CONST __m128 Sleef_nextafterf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_nextafterf4(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4(__m128); +IMPORT CONST __m128 Sleef_cinz_frfrexpf4(__m128); +IMPORT CONST __m128 Sleef_fmodf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmodf4(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_remainderf4(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_modff4(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_lgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_tgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_erff4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_erff4_u10(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15(__m128); +IMPORT CONST __m128 Sleef_cinz_erfcf4_u15(__m128); +IMPORT CONST int Sleef_getIntf4(int); +IMPORT CONST int Sleef_cinz_getIntf4(int); +IMPORT CONST void *Sleef_getPtrf4(int); +IMPORT CONST void *Sleef_cinz_getPtrf4(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +IMPORT CONST __m128d Sleef_sind2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u35sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35sse2(__m128d); +IMPORT CONST __m128d 
Sleef_cinz_asind2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_sind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u10sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u10sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_expd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_powd2_u10sse2(__m128d, __m128d); +IMPORT 
CONST __m128d Sleef_sinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastsind2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastcosd2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fastpowd2_u3500sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_asinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_acoshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_atanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_expm1d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10sse2(__m128d); 
+IMPORT CONST __m128d Sleef_cinz_log10d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_log1pd2_u10sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u05sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinpid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cospid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2_sse2(__m128d, __m128i); +IMPORT CONST __m128d Sleef_cinz_ldexpd2_sse2(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2_sse2(__m128d); +IMPORT CONST __m128i Sleef_cinz_ilogbd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_fmad2_sse2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmad2_sse2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u05sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fabsd2_sse2(__m128d); +IMPORT CONST __m128d 
Sleef_copysignd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_copysignd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmaxd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmind2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fdimd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_truncd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_floord2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_floord2_sse2(__m128d); +IMPORT CONST __m128d Sleef_ceild2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_ceild2_sse2(__m128d); +IMPORT CONST __m128d Sleef_roundd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_roundd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_rintd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_rintd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_nextafterd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_nextafterd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_frfrexpd2_sse2(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2_sse2(__m128d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_fmodd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmodd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_remainderd2_sse2(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2_sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_lgammad2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tgammad2_u10sse2(__m128d); +IMPORT CONST __m128d 
Sleef_erfd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfcd2_u15sse2(__m128d); +IMPORT CONST int Sleef_getIntd2_sse2(int); +IMPORT CONST void *Sleef_getPtrd2_sse2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u35sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u35sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_tanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u10sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u10sse2(__m128); +IMPORT CONST __m128 
Sleef_asinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_expf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_expf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_powf4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_powf4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_fastsinf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_fastcosf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fastpowf4_u3500sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_asinhf4_u10sse2(__m128); +IMPORT CONST 
__m128 Sleef_acoshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_acoshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_atanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_expm1f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_expm1f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_log1pf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log1pf4_u10sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinpif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cospif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_fmaf4_sse2(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaf4_sse2(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_sse2(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u05sse2(__m128); 
+IMPORT CONST __m128 Sleef_sqrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u05sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_fabsf4_sse2(__m128); +IMPORT CONST __m128 Sleef_copysignf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_copysignf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fmaxf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaxf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fminf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fdimf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_truncf4_sse2(__m128); +IMPORT CONST __m128 Sleef_floorf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_floorf4_sse2(__m128); +IMPORT CONST __m128 Sleef_ceilf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_ceilf4_sse2(__m128); +IMPORT CONST __m128 Sleef_roundf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_roundf4_sse2(__m128); +IMPORT CONST __m128 Sleef_rintf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_rintf4_sse2(__m128); +IMPORT CONST __m128 Sleef_nextafterf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_nextafterf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_frfrexpf4_sse2(__m128); +IMPORT CONST __m128 Sleef_fmodf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmodf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_remainderf4_sse2(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4_sse2(__m128); +IMPORT CONST 
Sleef___m128_2 Sleef_cinz_modff4_sse2(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_lgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_erff4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_erff4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_erfcf4_u15sse2(__m128); +IMPORT CONST int Sleef_getIntf4_sse2(int); +IMPORT CONST int Sleef_cinz_getIntf4_sse2(int); +IMPORT CONST void *Sleef_getPtrf4_sse2(int); +IMPORT CONST void *Sleef_cinz_getPtrf4_sse2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +IMPORT CONST __m128d Sleef_sind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u35sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u35sse4(__m128d); +IMPORT CONST __m128d 
Sleef_sind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u10sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_expd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_powd2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_sinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35sse4(__m128d); +IMPORT CONST __m128d 
Sleef_cinz_tanhd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastsind2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastcosd2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fastpowd2_u3500sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_asinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_acoshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_atanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_expm1d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log1pd2_u10sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u05sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35sse4(__m128d); +IMPORT CONST Sleef___m128d_2 
Sleef_cinz_sincospid2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinpid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cospid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2_sse4(__m128d, __m128i); +IMPORT CONST __m128d Sleef_cinz_ldexpd2_sse4(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2_sse4(__m128d); +IMPORT CONST __m128i Sleef_cinz_ilogbd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_fmad2_sse4(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmad2_sse4(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u05sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_fabsd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_copysignd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_copysignd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmaxd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmind2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fdimd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_truncd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_floord2_sse4(__m128d); +IMPORT 
CONST __m128d Sleef_cinz_floord2_sse4(__m128d); +IMPORT CONST __m128d Sleef_ceild2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_ceild2_sse4(__m128d); +IMPORT CONST __m128d Sleef_roundd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_roundd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_rintd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_rintd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_nextafterd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_nextafterd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_frfrexpd2_sse4(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2_sse4(__m128d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_fmodd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmodd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_remainderd2_sse4(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2_sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_lgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_erfd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfcd2_u15sse4(__m128d); +IMPORT CONST int Sleef_getIntd2_sse4(int); +IMPORT CONST void *Sleef_getPtrd2_sse4(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u35sse4(__m128); +IMPORT CONST 
Sleef___m128_2 Sleef_sincosf4_u35sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_tanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u10sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_asinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u10sse4(__m128); +IMPORT CONST __m128 
Sleef_expf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_expf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_powf4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_powf4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_fastsinf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_fastcosf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fastpowf4_u3500sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_asinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_acoshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_acoshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_atanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u35sse4(__m128); +IMPORT CONST __m128 
Sleef_expm1f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_expm1f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_log1pf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log1pf4_u10sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinpif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cospif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_fmaf4_sse4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaf4_sse4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_sse4(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u05sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_fabsf4_sse4(__m128); +IMPORT CONST __m128 Sleef_copysignf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_copysignf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fmaxf4_sse4(__m128, __m128); +IMPORT CONST __m128 
Sleef_cinz_fmaxf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fminf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fdimf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_truncf4_sse4(__m128); +IMPORT CONST __m128 Sleef_floorf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_floorf4_sse4(__m128); +IMPORT CONST __m128 Sleef_ceilf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_ceilf4_sse4(__m128); +IMPORT CONST __m128 Sleef_roundf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_roundf4_sse4(__m128); +IMPORT CONST __m128 Sleef_rintf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_rintf4_sse4(__m128); +IMPORT CONST __m128 Sleef_nextafterf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_nextafterf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_frfrexpf4_sse4(__m128); +IMPORT CONST __m128 Sleef_fmodf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmodf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_remainderf4_sse4(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4_sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_modff4_sse4(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_lgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_erff4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_erff4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_erfcf4_u15sse4(__m128); +IMPORT CONST int Sleef_getIntf4_sse4(int); +IMPORT CONST int Sleef_cinz_getIntf4_sse4(int); +IMPORT CONST void *Sleef_getPtrf4_sse4(int); +IMPORT CONST void 
*Sleef_cinz_getPtrf4_sse4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u35(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u35(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u35(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u35(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_acosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_atand4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u35(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u10(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u10(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u10(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u10(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_acosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u10(__m256d); +IMPORT CONST 
__m256d Sleef_atan2d4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u10(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_expd4_u10(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_powd4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u35(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastsind4_u3500(__m256d); +IMPORT CONST __m256d Sleef_fastcosd4_u3500(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastcosd4_u3500(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fastpowd4_u3500(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_asinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_acoshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_atanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_atanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35(__m256d); +IMPORT CONST __m256d 
Sleef_cinz_exp2d4_u35(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u35(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_expm1d4_u10(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_log10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u35(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_log1pd4_u10(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u05(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u35(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinpid4_u05(__m256d); +IMPORT CONST __m256d Sleef_cospid4_u05(__m256d); +IMPORT CONST __m256d Sleef_cinz_cospid4_u05(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4(__m256d, __m128i); +IMPORT CONST __m256d Sleef_cinz_ldexpd4(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4(__m256d); +IMPORT CONST __m128i Sleef_cinz_ilogbd4(__m256d); +IMPORT CONST __m256d Sleef_fmad4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmad4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u05(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u05(__m256d, __m256d); +IMPORT 
CONST __m256d Sleef_hypotd4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_fabsd4(__m256d); +IMPORT CONST __m256d Sleef_copysignd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_copysignd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmaxd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmind4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fdimd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_truncd4(__m256d); +IMPORT CONST __m256d Sleef_floord4(__m256d); +IMPORT CONST __m256d Sleef_cinz_floord4(__m256d); +IMPORT CONST __m256d Sleef_ceild4(__m256d); +IMPORT CONST __m256d Sleef_cinz_ceild4(__m256d); +IMPORT CONST __m256d Sleef_roundd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_roundd4(__m256d); +IMPORT CONST __m256d Sleef_rintd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_rintd4(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_nextafterd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_frfrexpd4(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4(__m256d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd4(__m256d); +IMPORT CONST __m256d Sleef_fmodd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmodd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_remainderd4(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_modfd4(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_lgammad4_u10(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10(__m256d); +IMPORT CONST 
__m256d Sleef_cinz_tgammad4_u10(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfd4_u10(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfcd4_u15(__m256d); +IMPORT CONST int Sleef_getIntd4(int); +IMPORT CONST void *Sleef_getPtrd4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u35(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u35(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u35(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u35(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u35(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u35(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u35(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u35(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u10(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u10(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u10(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u10(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u10(__m256); +IMPORT CONST 
__m256 Sleef_acosf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u10(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u10(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u10(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u10(__m256); +IMPORT CONST __m256 Sleef_expf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_expf8_u10(__m256); +IMPORT CONST __m256 Sleef_powf8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_powf8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u10(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_tanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u35(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u35(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_tanhf8_u35(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500(__m256); +IMPORT CONST __m256 Sleef_cinz_fastsinf8_u3500(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500(__m256); +IMPORT CONST __m256 Sleef_cinz_fastcosf8_u3500(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fastpowf8_u3500(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_asinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_acoshf8_u10(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_atanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10(__m256); 
+IMPORT CONST __m256 Sleef_cinz_exp2f8_u10(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_exp2f8_u35(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u10(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u35(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_expm1f8_u10(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_log10f8_u10(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u10(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u35(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_log1pf8_u10(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u05(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u35(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05(__m256); +IMPORT CONST __m256 Sleef_cinz_sinpif8_u05(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05(__m256); +IMPORT CONST __m256 Sleef_cinz_cospif8_u05(__m256); +IMPORT CONST __m256 Sleef_fmaf8(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaf8(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u05(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u05(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8(__m256); +IMPORT 
CONST __m256 Sleef_cinz_fabsf8(__m256); +IMPORT CONST __m256 Sleef_copysignf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_copysignf8(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaxf8(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fminf8(__m256, __m256); +IMPORT CONST __m256 Sleef_fdimf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fdimf8(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8(__m256); +IMPORT CONST __m256 Sleef_cinz_truncf8(__m256); +IMPORT CONST __m256 Sleef_floorf8(__m256); +IMPORT CONST __m256 Sleef_cinz_floorf8(__m256); +IMPORT CONST __m256 Sleef_ceilf8(__m256); +IMPORT CONST __m256 Sleef_cinz_ceilf8(__m256); +IMPORT CONST __m256 Sleef_roundf8(__m256); +IMPORT CONST __m256 Sleef_cinz_roundf8(__m256); +IMPORT CONST __m256 Sleef_rintf8(__m256); +IMPORT CONST __m256 Sleef_cinz_rintf8(__m256); +IMPORT CONST __m256 Sleef_nextafterf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_nextafterf8(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8(__m256); +IMPORT CONST __m256 Sleef_cinz_frfrexpf8(__m256); +IMPORT CONST __m256 Sleef_fmodf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmodf8(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_remainderf8(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_modff8(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_lgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_tgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_tgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_erff8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_erff8_u10(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15(__m256); +IMPORT CONST __m256 Sleef_cinz_erfcf8_u15(__m256); +IMPORT CONST int Sleef_getIntf8(int); +IMPORT CONST int Sleef_cinz_getIntf8(int); +IMPORT CONST void 
*Sleef_getPtrf8(int); +IMPORT CONST void *Sleef_cinz_getPtrf8(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u35avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_acosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_atand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u10avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10avx(__m256d); +IMPORT CONST __m256d 
Sleef_cinz_acosd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_expd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_powd4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastsind4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_fastcosd4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastcosd4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fastpowd4_u3500avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_asinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_acoshd4_u10avx(__m256d); +IMPORT CONST __m256d 
Sleef_atanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_atanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_expm1d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log1pd4_u10avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u05avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinpid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cospid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cospid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4_avx(__m256d, __m128i); +IMPORT CONST __m256d Sleef_cinz_ldexpd4_avx(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4_avx(__m256d); +IMPORT CONST __m128i Sleef_cinz_ilogbd4_avx(__m256d); +IMPORT CONST __m256d Sleef_fmad4_avx(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmad4_avx(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4_avx(__m256d); +IMPORT CONST 
__m256d Sleef_cinz_sqrtd4_avx(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u05avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_hypotd4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_fabsd4_avx(__m256d); +IMPORT CONST __m256d Sleef_copysignd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_copysignd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmaxd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmind4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fdimd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_truncd4_avx(__m256d); +IMPORT CONST __m256d Sleef_floord4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_floord4_avx(__m256d); +IMPORT CONST __m256d Sleef_ceild4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_ceild4_avx(__m256d); +IMPORT CONST __m256d Sleef_roundd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_roundd4_avx(__m256d); +IMPORT CONST __m256d Sleef_rintd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_rintd4_avx(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_nextafterd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_frfrexpd4_avx(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4_avx(__m256d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd4_avx(__m256d); +IMPORT CONST 
__m256d Sleef_fmodd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmodd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_remainderd4_avx(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4_avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_modfd4_avx(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_lgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfcd4_u15avx(__m256d); +IMPORT CONST int Sleef_getIntd4_avx(int); +IMPORT CONST void *Sleef_getPtrd4_avx(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u35avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u35avx(__m256); +IMPORT 
CONST __m256 Sleef_cbrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u10avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_acosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_expf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_expf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_powf8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_powf8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35avx(__m256); +IMPORT CONST __m256 
Sleef_cinz_tanhf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_cinz_fastsinf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_cinz_fastcosf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fastpowf8_u3500avx(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_asinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_acoshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_atanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_expm1f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log1pf8_u10avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u05avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u35avx(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05avx(__m256); +IMPORT CONST 
__m256 Sleef_cinz_sinpif8_u05avx(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cospif8_u05avx(__m256); +IMPORT CONST __m256 Sleef_fmaf8_avx(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaf8_avx(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_avx(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u05avx(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u05avx(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_fabsf8_avx(__m256); +IMPORT CONST __m256 Sleef_copysignf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_copysignf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaxf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fminf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fdimf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fdimf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_truncf8_avx(__m256); +IMPORT CONST __m256 Sleef_floorf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_floorf8_avx(__m256); +IMPORT CONST __m256 Sleef_ceilf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_ceilf8_avx(__m256); +IMPORT CONST __m256 Sleef_roundf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_roundf8_avx(__m256); +IMPORT CONST __m256 Sleef_rintf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_rintf8_avx(__m256); +IMPORT CONST __m256 Sleef_nextafterf8_avx(__m256, __m256); +IMPORT CONST __m256 
Sleef_cinz_nextafterf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_frfrexpf8_avx(__m256); +IMPORT CONST __m256 Sleef_fmodf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmodf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_remainderf8_avx(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8_avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_modff8_avx(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_lgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_tgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_erff8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_erff8_u10avx(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15avx(__m256); +IMPORT CONST __m256 Sleef_cinz_erfcf8_u15avx(__m256); +IMPORT CONST int Sleef_getIntf8_avx(int); +IMPORT CONST int Sleef_cinz_getIntf8_avx(int); +IMPORT CONST void *Sleef_getPtrf8_avx(int); +IMPORT CONST void *Sleef_cinz_getPtrf8_avx(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cosd4_u35fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u35fma4(__m256d); +IMPORT CONST __m256d 
Sleef_atand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cosd4_u10fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_expd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_powd4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10fma4(__m256d); +IMPORT CONST 
__m256d Sleef_finz_coshd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_coshd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_fastsind4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_fastcosd4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_fastcosd4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fastpowd4_u3500fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_asinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_acoshd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_atanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_atanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_expm1d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u10fma4(__m256d); 
+IMPORT CONST __m256d Sleef_log2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log1pd4_u10fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sinpid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_cospid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cospid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4_fma4(__m256d, __m128i); +IMPORT CONST __m256d Sleef_finz_ldexpd4_fma4(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4_fma4(__m256d); +IMPORT CONST __m128i Sleef_finz_ilogbd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_fmad4_fma4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmad4_fma4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u05fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_hypotd4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_fabsd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_copysignd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_copysignd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d 
Sleef_finz_fmaxd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmind4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fdimd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_truncd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_floord4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_floord4_fma4(__m256d); +IMPORT CONST __m256d Sleef_ceild4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_ceild4_fma4(__m256d); +IMPORT CONST __m256d Sleef_roundd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_roundd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_rintd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_rintd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_nextafterd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_frfrexpd4_fma4(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4_fma4(__m256d); +IMPORT CONST __m128i Sleef_finz_expfrexpd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_fmodd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmodd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_remainderd4_fma4(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4_fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_modfd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_lgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_erfd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15fma4(__m256d); +IMPORT CONST __m256d 
Sleef_finz_erfcd4_u15fma4(__m256d); +IMPORT CONST int Sleef_getIntd4_fma4(int); +IMPORT CONST void *Sleef_getPtrd4_fma4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u35fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_acosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u10fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_acosf8_u10fma4(__m256); +IMPORT CONST __m256 
Sleef_finz_acosf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_expf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_expf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_powf8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_powf8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_finz_fastsinf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_finz_fastcosf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fastpowf8_u3500fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_asinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_acoshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10fma4(__m256); +IMPORT CONST 
__m256 Sleef_finz_atanhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_expm1f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log1pf8_u10fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u05fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinpif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cospif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_fmaf8_fma4(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaf8_fma4(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_fma4(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05fma4(__m256, 
__m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u05fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_fabsf8_fma4(__m256); +IMPORT CONST __m256 Sleef_copysignf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_copysignf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaxf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fminf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fdimf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fdimf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_truncf8_fma4(__m256); +IMPORT CONST __m256 Sleef_floorf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_floorf8_fma4(__m256); +IMPORT CONST __m256 Sleef_ceilf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_ceilf8_fma4(__m256); +IMPORT CONST __m256 Sleef_roundf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_roundf8_fma4(__m256); +IMPORT CONST __m256 Sleef_rintf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_rintf8_fma4(__m256); +IMPORT CONST __m256 Sleef_nextafterf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_nextafterf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_frfrexpf8_fma4(__m256); +IMPORT CONST __m256 Sleef_fmodf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmodf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_remainderf8_fma4(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8_fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_modff8_fma4(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_lgammaf8_u10fma4(__m256); 
+IMPORT CONST __m256 Sleef_tgammaf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tgammaf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_erff8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_erff8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15fma4(__m256); +IMPORT CONST __m256 Sleef_finz_erfcf8_u15fma4(__m256); +IMPORT CONST int Sleef_getIntf8_fma4(int); +IMPORT CONST int Sleef_finz_getIntf8_fma4(int); +IMPORT CONST void *Sleef_getPtrf8_fma4(int); +IMPORT CONST void *Sleef_finz_getPtrf8_fma4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cosd4_u35avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_atand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10avx2(__m256d); +IMPORT CONST __m256d 
Sleef_finz_cosd4_u10avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_expd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_powd4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_coshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_coshd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_fastsind4_u3500avx2(__m256d); +IMPORT CONST 
__m256d Sleef_fastcosd4_u3500avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_fastcosd4_u3500avx2(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fastpowd4_u3500avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_asinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_acoshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_atanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_atanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_expm1d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log1pd4_u10avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sinpid4_u05avx2(__m256d); +IMPORT CONST __m256d 
Sleef_cospid4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cospid4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4_avx2(__m256d, __m128i); +IMPORT CONST __m256d Sleef_finz_ldexpd4_avx2(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4_avx2(__m256d); +IMPORT CONST __m128i Sleef_finz_ilogbd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_fmad4_avx2(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmad4_avx2(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u05avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_hypotd4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_fabsd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_copysignd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_copysignd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmaxd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmind4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fdimd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_truncd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_floord4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_floord4_avx2(__m256d); +IMPORT CONST __m256d Sleef_ceild4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_ceild4_avx2(__m256d); +IMPORT CONST __m256d 
Sleef_roundd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_roundd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_rintd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_rintd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_nextafterd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_frfrexpd4_avx2(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4_avx2(__m256d); +IMPORT CONST __m128i Sleef_finz_expfrexpd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_fmodd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmodd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_remainderd4_avx2(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4_avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_modfd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_lgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_erfd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_erfcd4_u15avx2(__m256d); +IMPORT CONST int Sleef_getIntd4_avx2(int); +IMPORT CONST void *Sleef_getPtrd4_avx2(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u35avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35avx2(__m256); +IMPORT CONST __m256 
Sleef_finz_tanf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_acosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u10avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_acosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_acosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u10avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_expf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_expf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_powf8_u10avx2(__m256, __m256); +IMPORT CONST __m256 
Sleef_finz_powf8_u10avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_finz_fastsinf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_finz_fastcosf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fastpowf8_u3500avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_asinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_acoshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_atanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_expm1f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10avx2(__m256); +IMPORT CONST __m256 
Sleef_finz_log10f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_log1pf8_u10avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u05avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinpif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cospif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_fmaf8_avx2(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaf8_avx2(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_avx2(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u05avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_fabsf8_avx2(__m256); +IMPORT CONST __m256 Sleef_copysignf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_copysignf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaxf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fminf8_avx2(__m256, __m256); 
+IMPORT CONST __m256 Sleef_fdimf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fdimf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_truncf8_avx2(__m256); +IMPORT CONST __m256 Sleef_floorf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_floorf8_avx2(__m256); +IMPORT CONST __m256 Sleef_ceilf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_ceilf8_avx2(__m256); +IMPORT CONST __m256 Sleef_roundf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_roundf8_avx2(__m256); +IMPORT CONST __m256 Sleef_rintf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_rintf8_avx2(__m256); +IMPORT CONST __m256 Sleef_nextafterf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_nextafterf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_frfrexpf8_avx2(__m256); +IMPORT CONST __m256 Sleef_fmodf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmodf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_remainderf8_avx2(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8_avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_modff8_avx2(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_lgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_tgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_erff8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_erff8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15avx2(__m256); +IMPORT CONST __m256 Sleef_finz_erfcf8_u15avx2(__m256); +IMPORT CONST int Sleef_getIntf8_avx2(int); +IMPORT CONST int Sleef_finz_getIntf8_avx2(int); +IMPORT CONST void *Sleef_getPtrf8_avx2(int); +IMPORT CONST void *Sleef_finz_getPtrf8_avx2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED 
+#endif + +IMPORT CONST __m128d Sleef_sind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cosd2_u35avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_asind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_acosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_atand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_atan2d2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_logd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cbrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_sind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cosd2_u10avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tand2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_asind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_acosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10avx2128(__m128d); +IMPORT CONST 
__m128d Sleef_finz_atand2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_atan2d2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_logd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cbrtd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_expd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_powd2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_sinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_coshd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sinhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_coshd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tanhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_fastsind2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_fastcosd2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fastpowd2_u3500avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_asinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10avx2128(__m128d); +IMPORT CONST __m128d 
Sleef_finz_acoshd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_atanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp10d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_expm1d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log1pd2_u10avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u05avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincospid2_u05avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincospid2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sinpid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cospid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2_avx2128(__m128d, __m128i); +IMPORT CONST __m128d Sleef_finz_ldexpd2_avx2128(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2_avx2128(__m128d); +IMPORT CONST __m128i Sleef_finz_ilogbd2_avx2128(__m128d); +IMPORT CONST 
__m128d Sleef_fmad2_avx2128(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmad2_avx2128(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sqrtd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sqrtd2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sqrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_hypotd2_u05avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_hypotd2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_fabsd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_copysignd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_copysignd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmaxd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmind2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fdimd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_truncd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_floord2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_floord2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_ceild2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_ceild2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_roundd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_roundd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_rintd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_rintd2_avx2128(__m128d); +IMPORT CONST __m128d 
Sleef_nextafterd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_nextafterd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_frfrexpd2_avx2128(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2_avx2128(__m128d); +IMPORT CONST __m128i Sleef_finz_expfrexpd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_fmodd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmodd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_remainderd2_avx2128(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2_avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_modfd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_lgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_erfd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_erfd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_erfcd2_u15avx2128(__m128d); +IMPORT CONST int Sleef_getIntd2_avx2128(int); +IMPORT CONST void *Sleef_getPtrd2_avx2128(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cosf4_u35avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u35avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_tanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35avx2128(__m128); +IMPORT CONST __m128 
Sleef_finz_asinf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_acosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_atanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_atan2f4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_logf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cbrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cosf4_u10avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_asinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_asinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_acosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_atanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_atan2f4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_logf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cbrtf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_expf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_expf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_powf4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 
Sleef_finz_powf4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_coshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_coshf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_fastsinf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_fastcosf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fastpowf4_u3500avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_asinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_acoshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_acoshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_atanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp10f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_expm1f4_u10avx2128(__m128); +IMPORT CONST __m128 
Sleef_finz_expm1f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_log1pf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log1pf4_u10avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincospif4_u05avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincospif4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinpif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cospif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_fmaf4_avx2128(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_finz_fmaf4_avx2128(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sqrtf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sqrtf4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sqrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_hypotf4_u05avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_hypotf4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_fabsf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_copysignf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_copysignf4_avx2128(__m128, __m128); +IMPORT CONST __m128 
Sleef_fmaxf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fmaxf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fminf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fdimf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_truncf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_floorf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_floorf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_ceilf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_ceilf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_roundf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_roundf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_rintf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_rintf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_nextafterf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_nextafterf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_frfrexpf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_fmodf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fmodf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_remainderf4_avx2128(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4_avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_modff4_avx2128(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_lgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_erff4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_erff4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_erfcf4_u15avx2128(__m128); +IMPORT CONST int 
Sleef_getIntf4_avx2128(int); +IMPORT CONST int Sleef_finz_getIntf4_avx2128(int); +IMPORT CONST void *Sleef_getPtrf4_avx2128(int); +IMPORT CONST void *Sleef_finz_getPtrf4_avx2128(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +IMPORT CONST __m512d Sleef_sind8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u35(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u35(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u35(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_tand8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u35(__m512d); +IMPORT CONST __m512d Sleef_asind8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u35(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_acosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_atand8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u35(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u35(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u35(__m512d); +IMPORT CONST __m512d Sleef_sind8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u10(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u10(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u10(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u10(__m512d); +IMPORT CONST __m512d Sleef_tand8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u10(__m512d); +IMPORT CONST __m512d Sleef_asind8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u10(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u10(__m512d); +IMPORT CONST 
__m512d Sleef_finz_acosd8_u10(__m512d); +IMPORT CONST __m512d Sleef_atand8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u10(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u10(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u10(__m512d); +IMPORT CONST __m512d Sleef_expd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_expd8_u10(__m512d); +IMPORT CONST __m512d Sleef_powd8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_powd8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_sinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_sinhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u35(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_fastsind8_u3500(__m512d); +IMPORT CONST __m512d Sleef_finz_fastsind8_u3500(__m512d); +IMPORT CONST __m512d Sleef_fastcosd8_u3500(__m512d); +IMPORT CONST __m512d Sleef_finz_fastcosd8_u3500(__m512d); +IMPORT CONST __m512d Sleef_fastpowd8_u3500(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fastpowd8_u3500(__m512d, __m512d); +IMPORT CONST __m512d Sleef_asinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_asinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_acoshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_acoshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_atanhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_atanhd8_u10(__m512d); +IMPORT CONST 
__m512d Sleef_exp2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u35(__m512d); +IMPORT CONST __m512d Sleef_expm1d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_expm1d8_u10(__m512d); +IMPORT CONST __m512d Sleef_log10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_log10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_log1pd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_log1pd8_u10(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u05(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincospid8_u05(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u35(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincospid8_u35(__m512d); +IMPORT CONST __m512d Sleef_sinpid8_u05(__m512d); +IMPORT CONST __m512d Sleef_finz_sinpid8_u05(__m512d); +IMPORT CONST __m512d Sleef_cospid8_u05(__m512d); +IMPORT CONST __m512d Sleef_finz_cospid8_u05(__m512d); +IMPORT CONST __m512d Sleef_ldexpd8(__m512d, __m256i); +IMPORT CONST __m512d Sleef_finz_ldexpd8(__m512d, __m256i); +IMPORT CONST __m256i Sleef_ilogbd8(__m512d); +IMPORT CONST __m256i Sleef_finz_ilogbd8(__m512d); +IMPORT CONST __m512d Sleef_fmad8(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmad8(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_sqrtd8(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u05(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_u05(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u35(__m512d); +IMPORT CONST __m512d 
Sleef_finz_sqrtd8_u35(__m512d); +IMPORT CONST __m512d Sleef_hypotd8_u05(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u05(__m512d, __m512d); +IMPORT CONST __m512d Sleef_hypotd8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fabsd8(__m512d); +IMPORT CONST __m512d Sleef_finz_fabsd8(__m512d); +IMPORT CONST __m512d Sleef_copysignd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_copysignd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmaxd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmaxd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmind8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmind8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fdimd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fdimd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_truncd8(__m512d); +IMPORT CONST __m512d Sleef_finz_truncd8(__m512d); +IMPORT CONST __m512d Sleef_floord8(__m512d); +IMPORT CONST __m512d Sleef_finz_floord8(__m512d); +IMPORT CONST __m512d Sleef_ceild8(__m512d); +IMPORT CONST __m512d Sleef_finz_ceild8(__m512d); +IMPORT CONST __m512d Sleef_roundd8(__m512d); +IMPORT CONST __m512d Sleef_finz_roundd8(__m512d); +IMPORT CONST __m512d Sleef_rintd8(__m512d); +IMPORT CONST __m512d Sleef_finz_rintd8(__m512d); +IMPORT CONST __m512d Sleef_nextafterd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_nextafterd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_frfrexpd8(__m512d); +IMPORT CONST __m512d Sleef_finz_frfrexpd8(__m512d); +IMPORT CONST __m256i Sleef_expfrexpd8(__m512d); +IMPORT CONST __m256i Sleef_finz_expfrexpd8(__m512d); +IMPORT CONST __m512d Sleef_fmodd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmodd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_remainderd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_remainderd8(__m512d, __m512d); +IMPORT CONST Sleef___m512d_2 Sleef_modfd8(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_modfd8(__m512d); +IMPORT 
CONST __m512d Sleef_lgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_lgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_tgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_tgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_erfd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_erfd8_u10(__m512d); +IMPORT CONST __m512d Sleef_erfcd8_u15(__m512d); +IMPORT CONST __m512d Sleef_finz_erfcd8_u15(__m512d); +IMPORT CONST int Sleef_getIntd8(int); +IMPORT CONST void *Sleef_getPtrd8(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +IMPORT CONST __m512 Sleef_sinf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u35(__m512); +IMPORT CONST __m512 Sleef_cosf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u35(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u35(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u35(__m512); +IMPORT CONST __m512 Sleef_tanf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u35(__m512); +IMPORT CONST __m512 Sleef_asinf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u35(__m512); +IMPORT CONST __m512 Sleef_acosf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u35(__m512); +IMPORT CONST __m512 Sleef_atanf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u35(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u35(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u35(__m512); +IMPORT CONST __m512 Sleef_sinf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u10(__m512); +IMPORT CONST __m512 Sleef_cosf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u10(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u10(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u10(__m512); +IMPORT CONST __m512 
Sleef_tanf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u10(__m512); +IMPORT CONST __m512 Sleef_asinf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u10(__m512); +IMPORT CONST __m512 Sleef_acosf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u10(__m512); +IMPORT CONST __m512 Sleef_atanf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u10(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u10(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u10(__m512); +IMPORT CONST __m512 Sleef_expf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_expf16_u10(__m512); +IMPORT CONST __m512 Sleef_powf16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_powf16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_sinhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u10(__m512); +IMPORT CONST __m512 Sleef_coshf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u10(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_sinhf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u35(__m512); +IMPORT CONST __m512 Sleef_coshf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u35(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u35(__m512); +IMPORT CONST __m512 Sleef_fastsinf16_u3500(__m512); +IMPORT CONST __m512 Sleef_finz_fastsinf16_u3500(__m512); +IMPORT CONST __m512 Sleef_fastcosf16_u3500(__m512); +IMPORT CONST __m512 Sleef_finz_fastcosf16_u3500(__m512); +IMPORT CONST __m512 Sleef_fastpowf16_u3500(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fastpowf16_u3500(__m512, __m512); +IMPORT CONST __m512 Sleef_asinhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_asinhf16_u10(__m512); +IMPORT CONST __m512 
Sleef_acoshf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_acoshf16_u10(__m512); +IMPORT CONST __m512 Sleef_atanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_atanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u10(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u35(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u10(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u35(__m512); +IMPORT CONST __m512 Sleef_expm1f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_expm1f16_u10(__m512); +IMPORT CONST __m512 Sleef_log10f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_log10f16_u10(__m512); +IMPORT CONST __m512 Sleef_log2f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u10(__m512); +IMPORT CONST __m512 Sleef_log2f16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u35(__m512); +IMPORT CONST __m512 Sleef_log1pf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_log1pf16_u10(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u05(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u05(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u35(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u35(__m512); +IMPORT CONST __m512 Sleef_sinpif16_u05(__m512); +IMPORT CONST __m512 Sleef_finz_sinpif16_u05(__m512); +IMPORT CONST __m512 Sleef_cospif16_u05(__m512); +IMPORT CONST __m512 Sleef_finz_cospif16_u05(__m512); +IMPORT CONST __m512 Sleef_fmaf16(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaf16(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_sqrtf16(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u05(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u05(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u35(__m512); +IMPORT CONST __m512 
Sleef_hypotf16_u05(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u05(__m512, __m512); +IMPORT CONST __m512 Sleef_hypotf16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_fabsf16(__m512); +IMPORT CONST __m512 Sleef_finz_fabsf16(__m512); +IMPORT CONST __m512 Sleef_copysignf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_copysignf16(__m512, __m512); +IMPORT CONST __m512 Sleef_fmaxf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaxf16(__m512, __m512); +IMPORT CONST __m512 Sleef_fminf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fminf16(__m512, __m512); +IMPORT CONST __m512 Sleef_fdimf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fdimf16(__m512, __m512); +IMPORT CONST __m512 Sleef_truncf16(__m512); +IMPORT CONST __m512 Sleef_finz_truncf16(__m512); +IMPORT CONST __m512 Sleef_floorf16(__m512); +IMPORT CONST __m512 Sleef_finz_floorf16(__m512); +IMPORT CONST __m512 Sleef_ceilf16(__m512); +IMPORT CONST __m512 Sleef_finz_ceilf16(__m512); +IMPORT CONST __m512 Sleef_roundf16(__m512); +IMPORT CONST __m512 Sleef_finz_roundf16(__m512); +IMPORT CONST __m512 Sleef_rintf16(__m512); +IMPORT CONST __m512 Sleef_finz_rintf16(__m512); +IMPORT CONST __m512 Sleef_nextafterf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_nextafterf16(__m512, __m512); +IMPORT CONST __m512 Sleef_frfrexpf16(__m512); +IMPORT CONST __m512 Sleef_finz_frfrexpf16(__m512); +IMPORT CONST __m512 Sleef_fmodf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmodf16(__m512, __m512); +IMPORT CONST __m512 Sleef_remainderf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_remainderf16(__m512, __m512); +IMPORT CONST Sleef___m512_2 Sleef_modff16(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_modff16(__m512); +IMPORT CONST __m512 Sleef_lgammaf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_lgammaf16_u10(__m512); +IMPORT CONST __m512 Sleef_tgammaf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_tgammaf16_u10(__m512); +IMPORT 
CONST __m512 Sleef_erff16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_erff16_u10(__m512); +IMPORT CONST __m512 Sleef_erfcf16_u15(__m512); +IMPORT CONST __m512 Sleef_finz_erfcf16_u15(__m512); +IMPORT CONST int Sleef_getIntf16(int); +IMPORT CONST int Sleef_finz_getIntf16(int); +IMPORT CONST void *Sleef_getPtrf16(int); +IMPORT CONST void *Sleef_finz_getPtrf16(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +IMPORT CONST __m512d Sleef_sind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u35avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u35avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_tand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_asind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_acosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_atand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_sind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u10avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 
Sleef_sincosd8_u10avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_tand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_asind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_acosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_atand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_expd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_expd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_powd8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_powd8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_sinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_sinhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_fastsind8_u3500avx512f(__m512d); +IMPORT CONST __m512d 
Sleef_finz_fastsind8_u3500avx512f(__m512d); +IMPORT CONST __m512d Sleef_fastcosd8_u3500avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_fastcosd8_u3500avx512f(__m512d); +IMPORT CONST __m512d Sleef_fastpowd8_u3500avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fastpowd8_u3500avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_asinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_asinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_acoshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_acoshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_atanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_atanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_expm1d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_expm1d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_log10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_log1pd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log1pd8_u10avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincospid8_u05avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 
Sleef_finz_sincospid8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_sinpid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sinpid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_cospid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cospid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_ldexpd8_avx512f(__m512d, __m256i); +IMPORT CONST __m512d Sleef_finz_ldexpd8_avx512f(__m512d, __m256i); +IMPORT CONST __m256i Sleef_ilogbd8_avx512f(__m512d); +IMPORT CONST __m256i Sleef_finz_ilogbd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_fmad8_avx512f(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmad8_avx512f(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_sqrtd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_hypotd8_u05avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u05avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_hypotd8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fabsd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_fabsd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_copysignd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_copysignd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmaxd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmaxd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmind8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmind8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fdimd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fdimd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_truncd8_avx512f(__m512d); +IMPORT CONST __m512d 
Sleef_finz_truncd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_floord8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_floord8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_ceild8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_ceild8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_roundd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_roundd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_rintd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_rintd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_nextafterd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_nextafterd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_frfrexpd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_frfrexpd8_avx512f(__m512d); +IMPORT CONST __m256i Sleef_expfrexpd8_avx512f(__m512d); +IMPORT CONST __m256i Sleef_finz_expfrexpd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_fmodd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmodd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_remainderd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_remainderd8_avx512f(__m512d, __m512d); +IMPORT CONST Sleef___m512d_2 Sleef_modfd8_avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_modfd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_lgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_lgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_tgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_erfd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_erfd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_erfcd8_u15avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_erfcd8_u15avx512f(__m512d); +IMPORT CONST int Sleef_getIntd8_avx512f(int); +IMPORT CONST void *Sleef_getPtrd8_avx512f(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +IMPORT CONST __m512 
Sleef_sinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_cosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u35avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u35avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_tanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_asinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_acosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_atanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_sinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_cosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u10avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u10avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_tanf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_asinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_acosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_atanf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u10avx512f(__m512); +IMPORT CONST __m512 
Sleef_atan2f16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_expf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_expf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_powf16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_powf16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_sinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_coshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_sinhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_coshf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_fastsinf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_fastsinf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_fastcosf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_fastcosf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_fastpowf16_u3500avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fastpowf16_u3500avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_asinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_asinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_acoshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_acoshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_atanhf16_u10avx512f(__m512); +IMPORT CONST __m512 
Sleef_finz_atanhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_expm1f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_expm1f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_log10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_log2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_log2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_log1pf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log1pf16_u10avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u05avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u05avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u35avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_sinpif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinpif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_cospif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cospif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_fmaf16_avx512f(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaf16_avx512f(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_sqrtf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u05avx512f(__m512); +IMPORT CONST __m512 
Sleef_sqrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_hypotf16_u05avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u05avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_hypotf16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fabsf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_fabsf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_copysignf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_copysignf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fmaxf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaxf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fminf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fminf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fdimf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fdimf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_truncf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_truncf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_floorf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_floorf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_ceilf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_ceilf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_roundf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_roundf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_rintf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_rintf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_nextafterf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_nextafterf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_frfrexpf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_frfrexpf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_fmodf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmodf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_remainderf16_avx512f(__m512, __m512); +IMPORT CONST __m512 
Sleef_finz_remainderf16_avx512f(__m512, __m512); +IMPORT CONST Sleef___m512_2 Sleef_modff16_avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_modff16_avx512f(__m512); +IMPORT CONST __m512 Sleef_lgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_lgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_tgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_erff16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_erff16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_erfcf16_u15avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_erfcf16_u15avx512f(__m512); +IMPORT CONST int Sleef_getIntf16_avx512f(int); +IMPORT CONST int Sleef_finz_getIntf16_avx512f(int); +IMPORT CONST void *Sleef_getPtrf16_avx512f(int); +IMPORT CONST void *Sleef_finz_getPtrf16_avx512f(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +IMPORT CONST __m512d Sleef_sind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cosd8_u35avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u35avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_asind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_asind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_acosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_atand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST 
__m512d Sleef_cinz_atan2d8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_logd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cbrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cosd8_u10avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u10avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_asind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_asind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_acosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_atand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_atan2d8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_logd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cbrtd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_expd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_expd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_powd8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_powd8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_sinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d 
Sleef_cinz_sinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_coshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sinhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sinhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_coshd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tanhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fastsind8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_fastsind8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fastcosd8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_fastcosd8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_asinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_asinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_acoshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_acoshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_atanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_exp2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_exp2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_exp10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d 
Sleef_cinz_exp10d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_expm1d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_expm1d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log1pd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log1pd8_u10avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u05avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sinpid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sinpid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cospid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cospid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_ldexpd8_avx512fnofma(__m512d, __m256i); +IMPORT CONST __m512d Sleef_cinz_ldexpd8_avx512fnofma(__m512d, __m256i); +IMPORT CONST __m256i Sleef_ilogbd8_avx512fnofma(__m512d); +IMPORT CONST __m256i Sleef_cinz_ilogbd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_sqrtd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sqrtd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sqrtd8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d 
Sleef_cinz_sqrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_hypotd8_u05avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_hypotd8_u05avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_hypotd8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_hypotd8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fabsd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_fabsd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_copysignd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_copysignd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmaxd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmaxd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmind8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmind8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fdimd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fdimd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_truncd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_truncd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_floord8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_floord8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_ceild8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_ceild8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_roundd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_roundd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_rintd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_rintd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_nextafterd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_nextafterd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_frfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_frfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m256i Sleef_expfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m256i 
Sleef_cinz_expfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fmodd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmodd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_remainderd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_remainderd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST Sleef___m512d_2 Sleef_modfd8_avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_modfd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_lgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_lgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_erfd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_erfd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_erfcd8_u15avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_erfcd8_u15avx512fnofma(__m512d); +IMPORT CONST int Sleef_getIntd8_avx512fnofma(int); +IMPORT CONST void *Sleef_getPtrd8_avx512fnofma(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +IMPORT CONST __m512 Sleef_sinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cosf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cosf16_u35avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u35avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincosf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_asinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_asinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_acosf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_acosf16_u35avx512fnofma(__m512); +IMPORT CONST 
__m512 Sleef_atanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_atanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_atan2f16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_logf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cbrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cosf16_u10avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u10avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_asinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_asinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_acosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_acosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_atanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_atan2f16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_logf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cbrtf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_expf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_expf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_powf16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 
Sleef_cinz_powf16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_sinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_coshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_coshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sinhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_coshf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_coshf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fastsinf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_fastsinf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fastcosf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_fastcosf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fastpowf16_u3500avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fastpowf16_u3500avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_asinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_asinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_acoshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_acoshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_atanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp10f16_u10avx512fnofma(__m512); +IMPORT CONST 
__m512 Sleef_exp10f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp10f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_expm1f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_expm1f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log10f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log10f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log1pf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log1pf16_u10avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u05avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincospif16_u05avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u35avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincospif16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sinpif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinpif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cospif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cospif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fmaf16_avx512fnofma(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fmaf16_avx512fnofma(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_sqrtf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sqrtf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sqrtf16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sqrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_hypotf16_u05avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_hypotf16_u05avx512fnofma(__m512, __m512); +IMPORT CONST __m512 
Sleef_hypotf16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_hypotf16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fabsf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_fabsf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_copysignf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_copysignf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fmaxf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fmaxf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fminf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fminf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fdimf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fdimf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_truncf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_truncf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_floorf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_floorf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_ceilf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_ceilf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_roundf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_roundf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_rintf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_rintf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_nextafterf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_nextafterf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_frfrexpf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_frfrexpf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fmodf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fmodf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_remainderf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_remainderf16_avx512fnofma(__m512, __m512); +IMPORT CONST Sleef___m512_2 
Sleef_modff16_avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_modff16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_lgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_lgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_erff16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_erff16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_erfcf16_u15avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_erfcf16_u15avx512fnofma(__m512); +IMPORT CONST int Sleef_getIntf16_avx512fnofma(int); +IMPORT CONST int Sleef_cinz_getIntf16_avx512fnofma(int); +IMPORT CONST void *Sleef_getPtrf16_avx512fnofma(int); +IMPORT CONST void *Sleef_cinz_getPtrf16_avx512fnofma(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef struct { + double x, y; +} Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +IMPORT CONST double Sleef_sind1_u35purec(double); +IMPORT CONST double Sleef_cinz_sind1_u35purec(double); +IMPORT CONST double Sleef_cosd1_u35purec(double); +IMPORT CONST double Sleef_cinz_cosd1_u35purec(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u35purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincosd1_u35purec(double); +IMPORT CONST double Sleef_tand1_u35purec(double); +IMPORT CONST double Sleef_cinz_tand1_u35purec(double); +IMPORT CONST double Sleef_asind1_u35purec(double); +IMPORT CONST double Sleef_cinz_asind1_u35purec(double); +IMPORT CONST double Sleef_acosd1_u35purec(double); +IMPORT CONST double Sleef_cinz_acosd1_u35purec(double); +IMPORT CONST double Sleef_atand1_u35purec(double); +IMPORT CONST double Sleef_cinz_atand1_u35purec(double); +IMPORT CONST double Sleef_atan2d1_u35purec(double, double); +IMPORT CONST double Sleef_cinz_atan2d1_u35purec(double, double); +IMPORT CONST double Sleef_logd1_u35purec(double); +IMPORT CONST double 
Sleef_cinz_logd1_u35purec(double); +IMPORT CONST double Sleef_cbrtd1_u35purec(double); +IMPORT CONST double Sleef_cinz_cbrtd1_u35purec(double); +IMPORT CONST double Sleef_sind1_u10purec(double); +IMPORT CONST double Sleef_cinz_sind1_u10purec(double); +IMPORT CONST double Sleef_cosd1_u10purec(double); +IMPORT CONST double Sleef_cinz_cosd1_u10purec(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u10purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincosd1_u10purec(double); +IMPORT CONST double Sleef_tand1_u10purec(double); +IMPORT CONST double Sleef_cinz_tand1_u10purec(double); +IMPORT CONST double Sleef_asind1_u10purec(double); +IMPORT CONST double Sleef_cinz_asind1_u10purec(double); +IMPORT CONST double Sleef_acosd1_u10purec(double); +IMPORT CONST double Sleef_cinz_acosd1_u10purec(double); +IMPORT CONST double Sleef_atand1_u10purec(double); +IMPORT CONST double Sleef_cinz_atand1_u10purec(double); +IMPORT CONST double Sleef_atan2d1_u10purec(double, double); +IMPORT CONST double Sleef_cinz_atan2d1_u10purec(double, double); +IMPORT CONST double Sleef_logd1_u10purec(double); +IMPORT CONST double Sleef_cinz_logd1_u10purec(double); +IMPORT CONST double Sleef_cbrtd1_u10purec(double); +IMPORT CONST double Sleef_cinz_cbrtd1_u10purec(double); +IMPORT CONST double Sleef_expd1_u10purec(double); +IMPORT CONST double Sleef_cinz_expd1_u10purec(double); +IMPORT CONST double Sleef_powd1_u10purec(double, double); +IMPORT CONST double Sleef_cinz_powd1_u10purec(double, double); +IMPORT CONST double Sleef_sinhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_sinhd1_u10purec(double); +IMPORT CONST double Sleef_coshd1_u10purec(double); +IMPORT CONST double Sleef_cinz_coshd1_u10purec(double); +IMPORT CONST double Sleef_tanhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_tanhd1_u10purec(double); +IMPORT CONST double Sleef_sinhd1_u35purec(double); +IMPORT CONST double Sleef_cinz_sinhd1_u35purec(double); +IMPORT CONST double Sleef_coshd1_u35purec(double); +IMPORT CONST 
double Sleef_cinz_coshd1_u35purec(double); +IMPORT CONST double Sleef_tanhd1_u35purec(double); +IMPORT CONST double Sleef_cinz_tanhd1_u35purec(double); +IMPORT CONST double Sleef_fastsind1_u3500purec(double); +IMPORT CONST double Sleef_cinz_fastsind1_u3500purec(double); +IMPORT CONST double Sleef_fastcosd1_u3500purec(double); +IMPORT CONST double Sleef_cinz_fastcosd1_u3500purec(double); +IMPORT CONST double Sleef_fastpowd1_u3500purec(double, double); +IMPORT CONST double Sleef_cinz_fastpowd1_u3500purec(double, double); +IMPORT CONST double Sleef_asinhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_asinhd1_u10purec(double); +IMPORT CONST double Sleef_acoshd1_u10purec(double); +IMPORT CONST double Sleef_cinz_acoshd1_u10purec(double); +IMPORT CONST double Sleef_atanhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_atanhd1_u10purec(double); +IMPORT CONST double Sleef_exp2d1_u10purec(double); +IMPORT CONST double Sleef_cinz_exp2d1_u10purec(double); +IMPORT CONST double Sleef_exp2d1_u35purec(double); +IMPORT CONST double Sleef_cinz_exp2d1_u35purec(double); +IMPORT CONST double Sleef_exp10d1_u10purec(double); +IMPORT CONST double Sleef_cinz_exp10d1_u10purec(double); +IMPORT CONST double Sleef_exp10d1_u35purec(double); +IMPORT CONST double Sleef_cinz_exp10d1_u35purec(double); +IMPORT CONST double Sleef_expm1d1_u10purec(double); +IMPORT CONST double Sleef_cinz_expm1d1_u10purec(double); +IMPORT CONST double Sleef_log10d1_u10purec(double); +IMPORT CONST double Sleef_cinz_log10d1_u10purec(double); +IMPORT CONST double Sleef_log2d1_u10purec(double); +IMPORT CONST double Sleef_cinz_log2d1_u10purec(double); +IMPORT CONST double Sleef_log2d1_u35purec(double); +IMPORT CONST double Sleef_cinz_log2d1_u35purec(double); +IMPORT CONST double Sleef_log1pd1_u10purec(double); +IMPORT CONST double Sleef_cinz_log1pd1_u10purec(double); +IMPORT CONST Sleef_double_2 Sleef_sincospid1_u05purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincospid1_u05purec(double); +IMPORT CONST 
Sleef_double_2 Sleef_sincospid1_u35purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincospid1_u35purec(double); +IMPORT CONST double Sleef_sinpid1_u05purec(double); +IMPORT CONST double Sleef_cinz_sinpid1_u05purec(double); +IMPORT CONST double Sleef_cospid1_u05purec(double); +IMPORT CONST double Sleef_cinz_cospid1_u05purec(double); +IMPORT CONST double Sleef_ldexpd1_purec(double, int32_t); +IMPORT CONST double Sleef_cinz_ldexpd1_purec(double, int32_t); +IMPORT CONST int32_t Sleef_ilogbd1_purec(double); +IMPORT CONST int32_t Sleef_cinz_ilogbd1_purec(double); +IMPORT CONST double Sleef_fmad1_purec(double, double, double); +IMPORT CONST double Sleef_cinz_fmad1_purec(double, double, double); +IMPORT CONST double Sleef_sqrtd1_purec(double); +IMPORT CONST double Sleef_cinz_sqrtd1_purec(double); +IMPORT CONST double Sleef_sqrtd1_u05purec(double); +IMPORT CONST double Sleef_cinz_sqrtd1_u05purec(double); +IMPORT CONST double Sleef_sqrtd1_u35purec(double); +IMPORT CONST double Sleef_cinz_sqrtd1_u35purec(double); +IMPORT CONST double Sleef_hypotd1_u05purec(double, double); +IMPORT CONST double Sleef_cinz_hypotd1_u05purec(double, double); +IMPORT CONST double Sleef_hypotd1_u35purec(double, double); +IMPORT CONST double Sleef_cinz_hypotd1_u35purec(double, double); +IMPORT CONST double Sleef_fabsd1_purec(double); +IMPORT CONST double Sleef_cinz_fabsd1_purec(double); +IMPORT CONST double Sleef_copysignd1_purec(double, double); +IMPORT CONST double Sleef_cinz_copysignd1_purec(double, double); +IMPORT CONST double Sleef_fmaxd1_purec(double, double); +IMPORT CONST double Sleef_cinz_fmaxd1_purec(double, double); +IMPORT CONST double Sleef_fmind1_purec(double, double); +IMPORT CONST double Sleef_cinz_fmind1_purec(double, double); +IMPORT CONST double Sleef_fdimd1_purec(double, double); +IMPORT CONST double Sleef_cinz_fdimd1_purec(double, double); +IMPORT CONST double Sleef_truncd1_purec(double); +IMPORT CONST double Sleef_cinz_truncd1_purec(double); +IMPORT CONST double 
Sleef_floord1_purec(double); +IMPORT CONST double Sleef_cinz_floord1_purec(double); +IMPORT CONST double Sleef_ceild1_purec(double); +IMPORT CONST double Sleef_cinz_ceild1_purec(double); +IMPORT CONST double Sleef_roundd1_purec(double); +IMPORT CONST double Sleef_cinz_roundd1_purec(double); +IMPORT CONST double Sleef_rintd1_purec(double); +IMPORT CONST double Sleef_cinz_rintd1_purec(double); +IMPORT CONST double Sleef_nextafterd1_purec(double, double); +IMPORT CONST double Sleef_cinz_nextafterd1_purec(double, double); +IMPORT CONST double Sleef_frfrexpd1_purec(double); +IMPORT CONST double Sleef_cinz_frfrexpd1_purec(double); +IMPORT CONST int32_t Sleef_expfrexpd1_purec(double); +IMPORT CONST int32_t Sleef_cinz_expfrexpd1_purec(double); +IMPORT CONST double Sleef_fmodd1_purec(double, double); +IMPORT CONST double Sleef_cinz_fmodd1_purec(double, double); +IMPORT CONST double Sleef_remainderd1_purec(double, double); +IMPORT CONST double Sleef_cinz_remainderd1_purec(double, double); +IMPORT CONST Sleef_double_2 Sleef_modfd1_purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_modfd1_purec(double); +IMPORT CONST double Sleef_lgammad1_u10purec(double); +IMPORT CONST double Sleef_cinz_lgammad1_u10purec(double); +IMPORT CONST double Sleef_tgammad1_u10purec(double); +IMPORT CONST double Sleef_cinz_tgammad1_u10purec(double); +IMPORT CONST double Sleef_erfd1_u10purec(double); +IMPORT CONST double Sleef_cinz_erfd1_u10purec(double); +IMPORT CONST double Sleef_erfcd1_u15purec(double); +IMPORT CONST double Sleef_cinz_erfcd1_u15purec(double); +IMPORT CONST int Sleef_getIntd1_purec(int); +IMPORT CONST void *Sleef_getPtrd1_purec(int); + +#ifndef Sleef_float_2_DEFINED +typedef struct { + float x, y; +} Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +IMPORT CONST float Sleef_sinf1_u35purec(float); +IMPORT CONST float Sleef_cinz_sinf1_u35purec(float); +IMPORT CONST float Sleef_cosf1_u35purec(float); +IMPORT CONST float Sleef_cinz_cosf1_u35purec(float); +IMPORT CONST 
Sleef_float_2 Sleef_sincosf1_u35purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincosf1_u35purec(float); +IMPORT CONST float Sleef_tanf1_u35purec(float); +IMPORT CONST float Sleef_cinz_tanf1_u35purec(float); +IMPORT CONST float Sleef_asinf1_u35purec(float); +IMPORT CONST float Sleef_cinz_asinf1_u35purec(float); +IMPORT CONST float Sleef_acosf1_u35purec(float); +IMPORT CONST float Sleef_cinz_acosf1_u35purec(float); +IMPORT CONST float Sleef_atanf1_u35purec(float); +IMPORT CONST float Sleef_cinz_atanf1_u35purec(float); +IMPORT CONST float Sleef_atan2f1_u35purec(float, float); +IMPORT CONST float Sleef_cinz_atan2f1_u35purec(float, float); +IMPORT CONST float Sleef_logf1_u35purec(float); +IMPORT CONST float Sleef_cinz_logf1_u35purec(float); +IMPORT CONST float Sleef_cbrtf1_u35purec(float); +IMPORT CONST float Sleef_cinz_cbrtf1_u35purec(float); +IMPORT CONST float Sleef_sinf1_u10purec(float); +IMPORT CONST float Sleef_cinz_sinf1_u10purec(float); +IMPORT CONST float Sleef_cosf1_u10purec(float); +IMPORT CONST float Sleef_cinz_cosf1_u10purec(float); +IMPORT CONST Sleef_float_2 Sleef_sincosf1_u10purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincosf1_u10purec(float); +IMPORT CONST float Sleef_tanf1_u10purec(float); +IMPORT CONST float Sleef_cinz_tanf1_u10purec(float); +IMPORT CONST float Sleef_asinf1_u10purec(float); +IMPORT CONST float Sleef_cinz_asinf1_u10purec(float); +IMPORT CONST float Sleef_acosf1_u10purec(float); +IMPORT CONST float Sleef_cinz_acosf1_u10purec(float); +IMPORT CONST float Sleef_atanf1_u10purec(float); +IMPORT CONST float Sleef_cinz_atanf1_u10purec(float); +IMPORT CONST float Sleef_atan2f1_u10purec(float, float); +IMPORT CONST float Sleef_cinz_atan2f1_u10purec(float, float); +IMPORT CONST float Sleef_logf1_u10purec(float); +IMPORT CONST float Sleef_cinz_logf1_u10purec(float); +IMPORT CONST float Sleef_cbrtf1_u10purec(float); +IMPORT CONST float Sleef_cinz_cbrtf1_u10purec(float); +IMPORT CONST float Sleef_expf1_u10purec(float); +IMPORT CONST 
float Sleef_cinz_expf1_u10purec(float); +IMPORT CONST float Sleef_powf1_u10purec(float, float); +IMPORT CONST float Sleef_cinz_powf1_u10purec(float, float); +IMPORT CONST float Sleef_sinhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_sinhf1_u10purec(float); +IMPORT CONST float Sleef_coshf1_u10purec(float); +IMPORT CONST float Sleef_cinz_coshf1_u10purec(float); +IMPORT CONST float Sleef_tanhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_tanhf1_u10purec(float); +IMPORT CONST float Sleef_sinhf1_u35purec(float); +IMPORT CONST float Sleef_cinz_sinhf1_u35purec(float); +IMPORT CONST float Sleef_coshf1_u35purec(float); +IMPORT CONST float Sleef_cinz_coshf1_u35purec(float); +IMPORT CONST float Sleef_tanhf1_u35purec(float); +IMPORT CONST float Sleef_cinz_tanhf1_u35purec(float); +IMPORT CONST float Sleef_fastsinf1_u3500purec(float); +IMPORT CONST float Sleef_cinz_fastsinf1_u3500purec(float); +IMPORT CONST float Sleef_fastcosf1_u3500purec(float); +IMPORT CONST float Sleef_cinz_fastcosf1_u3500purec(float); +IMPORT CONST float Sleef_fastpowf1_u3500purec(float, float); +IMPORT CONST float Sleef_cinz_fastpowf1_u3500purec(float, float); +IMPORT CONST float Sleef_asinhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_asinhf1_u10purec(float); +IMPORT CONST float Sleef_acoshf1_u10purec(float); +IMPORT CONST float Sleef_cinz_acoshf1_u10purec(float); +IMPORT CONST float Sleef_atanhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_atanhf1_u10purec(float); +IMPORT CONST float Sleef_exp2f1_u10purec(float); +IMPORT CONST float Sleef_cinz_exp2f1_u10purec(float); +IMPORT CONST float Sleef_exp2f1_u35purec(float); +IMPORT CONST float Sleef_cinz_exp2f1_u35purec(float); +IMPORT CONST float Sleef_exp10f1_u10purec(float); +IMPORT CONST float Sleef_cinz_exp10f1_u10purec(float); +IMPORT CONST float Sleef_exp10f1_u35purec(float); +IMPORT CONST float Sleef_cinz_exp10f1_u35purec(float); +IMPORT CONST float Sleef_expm1f1_u10purec(float); +IMPORT CONST float Sleef_cinz_expm1f1_u10purec(float); 
+IMPORT CONST float Sleef_log10f1_u10purec(float); +IMPORT CONST float Sleef_cinz_log10f1_u10purec(float); +IMPORT CONST float Sleef_log2f1_u10purec(float); +IMPORT CONST float Sleef_cinz_log2f1_u10purec(float); +IMPORT CONST float Sleef_log2f1_u35purec(float); +IMPORT CONST float Sleef_cinz_log2f1_u35purec(float); +IMPORT CONST float Sleef_log1pf1_u10purec(float); +IMPORT CONST float Sleef_cinz_log1pf1_u10purec(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u05purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincospif1_u05purec(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u35purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincospif1_u35purec(float); +IMPORT CONST float Sleef_sinpif1_u05purec(float); +IMPORT CONST float Sleef_cinz_sinpif1_u05purec(float); +IMPORT CONST float Sleef_cospif1_u05purec(float); +IMPORT CONST float Sleef_cinz_cospif1_u05purec(float); +IMPORT CONST float Sleef_fmaf1_purec(float, float, float); +IMPORT CONST float Sleef_cinz_fmaf1_purec(float, float, float); +IMPORT CONST float Sleef_sqrtf1_purec(float); +IMPORT CONST float Sleef_cinz_sqrtf1_purec(float); +IMPORT CONST float Sleef_sqrtf1_u05purec(float); +IMPORT CONST float Sleef_cinz_sqrtf1_u05purec(float); +IMPORT CONST float Sleef_sqrtf1_u35purec(float); +IMPORT CONST float Sleef_cinz_sqrtf1_u35purec(float); +IMPORT CONST float Sleef_hypotf1_u05purec(float, float); +IMPORT CONST float Sleef_cinz_hypotf1_u05purec(float, float); +IMPORT CONST float Sleef_hypotf1_u35purec(float, float); +IMPORT CONST float Sleef_cinz_hypotf1_u35purec(float, float); +IMPORT CONST float Sleef_fabsf1_purec(float); +IMPORT CONST float Sleef_cinz_fabsf1_purec(float); +IMPORT CONST float Sleef_copysignf1_purec(float, float); +IMPORT CONST float Sleef_cinz_copysignf1_purec(float, float); +IMPORT CONST float Sleef_fmaxf1_purec(float, float); +IMPORT CONST float Sleef_cinz_fmaxf1_purec(float, float); +IMPORT CONST float Sleef_fminf1_purec(float, float); +IMPORT CONST float 
Sleef_cinz_fminf1_purec(float, float); +IMPORT CONST float Sleef_fdimf1_purec(float, float); +IMPORT CONST float Sleef_cinz_fdimf1_purec(float, float); +IMPORT CONST float Sleef_truncf1_purec(float); +IMPORT CONST float Sleef_cinz_truncf1_purec(float); +IMPORT CONST float Sleef_floorf1_purec(float); +IMPORT CONST float Sleef_cinz_floorf1_purec(float); +IMPORT CONST float Sleef_ceilf1_purec(float); +IMPORT CONST float Sleef_cinz_ceilf1_purec(float); +IMPORT CONST float Sleef_roundf1_purec(float); +IMPORT CONST float Sleef_cinz_roundf1_purec(float); +IMPORT CONST float Sleef_rintf1_purec(float); +IMPORT CONST float Sleef_cinz_rintf1_purec(float); +IMPORT CONST float Sleef_nextafterf1_purec(float, float); +IMPORT CONST float Sleef_cinz_nextafterf1_purec(float, float); +IMPORT CONST float Sleef_frfrexpf1_purec(float); +IMPORT CONST float Sleef_cinz_frfrexpf1_purec(float); +IMPORT CONST float Sleef_fmodf1_purec(float, float); +IMPORT CONST float Sleef_cinz_fmodf1_purec(float, float); +IMPORT CONST float Sleef_remainderf1_purec(float, float); +IMPORT CONST float Sleef_cinz_remainderf1_purec(float, float); +IMPORT CONST Sleef_float_2 Sleef_modff1_purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_modff1_purec(float); +IMPORT CONST float Sleef_lgammaf1_u10purec(float); +IMPORT CONST float Sleef_cinz_lgammaf1_u10purec(float); +IMPORT CONST float Sleef_tgammaf1_u10purec(float); +IMPORT CONST float Sleef_cinz_tgammaf1_u10purec(float); +IMPORT CONST float Sleef_erff1_u10purec(float); +IMPORT CONST float Sleef_cinz_erff1_u10purec(float); +IMPORT CONST float Sleef_erfcf1_u15purec(float); +IMPORT CONST float Sleef_cinz_erfcf1_u15purec(float); +IMPORT CONST int Sleef_getIntf1_purec(int); +IMPORT CONST int Sleef_cinz_getIntf1_purec(int); +IMPORT CONST void *Sleef_getPtrf1_purec(int); +IMPORT CONST void *Sleef_cinz_getPtrf1_purec(int); +#endif +#ifdef FP_FAST_FMA + +#ifndef Sleef_double_2_DEFINED +typedef struct { + double x, y; +} Sleef_double_2; +#define Sleef_double_2_DEFINED 
+#endif + +IMPORT CONST double Sleef_sind1_u35purecfma(double); +IMPORT CONST double Sleef_finz_sind1_u35purecfma(double); +IMPORT CONST double Sleef_cosd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_cosd1_u35purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u35purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincosd1_u35purecfma(double); +IMPORT CONST double Sleef_tand1_u35purecfma(double); +IMPORT CONST double Sleef_finz_tand1_u35purecfma(double); +IMPORT CONST double Sleef_asind1_u35purecfma(double); +IMPORT CONST double Sleef_finz_asind1_u35purecfma(double); +IMPORT CONST double Sleef_acosd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_acosd1_u35purecfma(double); +IMPORT CONST double Sleef_atand1_u35purecfma(double); +IMPORT CONST double Sleef_finz_atand1_u35purecfma(double); +IMPORT CONST double Sleef_atan2d1_u35purecfma(double, double); +IMPORT CONST double Sleef_finz_atan2d1_u35purecfma(double, double); +IMPORT CONST double Sleef_logd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_logd1_u35purecfma(double); +IMPORT CONST double Sleef_cbrtd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_cbrtd1_u35purecfma(double); +IMPORT CONST double Sleef_sind1_u10purecfma(double); +IMPORT CONST double Sleef_finz_sind1_u10purecfma(double); +IMPORT CONST double Sleef_cosd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_cosd1_u10purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u10purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincosd1_u10purecfma(double); +IMPORT CONST double Sleef_tand1_u10purecfma(double); +IMPORT CONST double Sleef_finz_tand1_u10purecfma(double); +IMPORT CONST double Sleef_asind1_u10purecfma(double); +IMPORT CONST double Sleef_finz_asind1_u10purecfma(double); +IMPORT CONST double Sleef_acosd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_acosd1_u10purecfma(double); +IMPORT CONST double Sleef_atand1_u10purecfma(double); +IMPORT CONST double 
Sleef_finz_atand1_u10purecfma(double); +IMPORT CONST double Sleef_atan2d1_u10purecfma(double, double); +IMPORT CONST double Sleef_finz_atan2d1_u10purecfma(double, double); +IMPORT CONST double Sleef_logd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_logd1_u10purecfma(double); +IMPORT CONST double Sleef_cbrtd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_cbrtd1_u10purecfma(double); +IMPORT CONST double Sleef_expd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_expd1_u10purecfma(double); +IMPORT CONST double Sleef_powd1_u10purecfma(double, double); +IMPORT CONST double Sleef_finz_powd1_u10purecfma(double, double); +IMPORT CONST double Sleef_sinhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_sinhd1_u10purecfma(double); +IMPORT CONST double Sleef_coshd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_coshd1_u10purecfma(double); +IMPORT CONST double Sleef_tanhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_tanhd1_u10purecfma(double); +IMPORT CONST double Sleef_sinhd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_sinhd1_u35purecfma(double); +IMPORT CONST double Sleef_coshd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_coshd1_u35purecfma(double); +IMPORT CONST double Sleef_tanhd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_tanhd1_u35purecfma(double); +IMPORT CONST double Sleef_fastsind1_u3500purecfma(double); +IMPORT CONST double Sleef_finz_fastsind1_u3500purecfma(double); +IMPORT CONST double Sleef_fastcosd1_u3500purecfma(double); +IMPORT CONST double Sleef_finz_fastcosd1_u3500purecfma(double); +IMPORT CONST double Sleef_fastpowd1_u3500purecfma(double, double); +IMPORT CONST double Sleef_finz_fastpowd1_u3500purecfma(double, double); +IMPORT CONST double Sleef_asinhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_asinhd1_u10purecfma(double); +IMPORT CONST double Sleef_acoshd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_acoshd1_u10purecfma(double); +IMPORT CONST double 
Sleef_atanhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_atanhd1_u10purecfma(double); +IMPORT CONST double Sleef_exp2d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_exp2d1_u10purecfma(double); +IMPORT CONST double Sleef_exp2d1_u35purecfma(double); +IMPORT CONST double Sleef_finz_exp2d1_u35purecfma(double); +IMPORT CONST double Sleef_exp10d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_exp10d1_u10purecfma(double); +IMPORT CONST double Sleef_exp10d1_u35purecfma(double); +IMPORT CONST double Sleef_finz_exp10d1_u35purecfma(double); +IMPORT CONST double Sleef_expm1d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_expm1d1_u10purecfma(double); +IMPORT CONST double Sleef_log10d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_log10d1_u10purecfma(double); +IMPORT CONST double Sleef_log2d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_log2d1_u10purecfma(double); +IMPORT CONST double Sleef_log2d1_u35purecfma(double); +IMPORT CONST double Sleef_finz_log2d1_u35purecfma(double); +IMPORT CONST double Sleef_log1pd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_log1pd1_u10purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincospid1_u05purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincospid1_u05purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincospid1_u35purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincospid1_u35purecfma(double); +IMPORT CONST double Sleef_sinpid1_u05purecfma(double); +IMPORT CONST double Sleef_finz_sinpid1_u05purecfma(double); +IMPORT CONST double Sleef_cospid1_u05purecfma(double); +IMPORT CONST double Sleef_finz_cospid1_u05purecfma(double); +IMPORT CONST double Sleef_ldexpd1_purecfma(double, int32_t); +IMPORT CONST double Sleef_finz_ldexpd1_purecfma(double, int32_t); +IMPORT CONST int32_t Sleef_ilogbd1_purecfma(double); +IMPORT CONST int32_t Sleef_finz_ilogbd1_purecfma(double); +IMPORT CONST double Sleef_fmad1_purecfma(double, double, double); +IMPORT CONST double 
Sleef_finz_fmad1_purecfma(double, double, double); +IMPORT CONST double Sleef_sqrtd1_purecfma(double); +IMPORT CONST double Sleef_finz_sqrtd1_purecfma(double); +IMPORT CONST double Sleef_sqrtd1_u05purecfma(double); +IMPORT CONST double Sleef_finz_sqrtd1_u05purecfma(double); +IMPORT CONST double Sleef_sqrtd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_sqrtd1_u35purecfma(double); +IMPORT CONST double Sleef_hypotd1_u05purecfma(double, double); +IMPORT CONST double Sleef_finz_hypotd1_u05purecfma(double, double); +IMPORT CONST double Sleef_hypotd1_u35purecfma(double, double); +IMPORT CONST double Sleef_finz_hypotd1_u35purecfma(double, double); +IMPORT CONST double Sleef_fabsd1_purecfma(double); +IMPORT CONST double Sleef_finz_fabsd1_purecfma(double); +IMPORT CONST double Sleef_copysignd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_copysignd1_purecfma(double, double); +IMPORT CONST double Sleef_fmaxd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fmaxd1_purecfma(double, double); +IMPORT CONST double Sleef_fmind1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fmind1_purecfma(double, double); +IMPORT CONST double Sleef_fdimd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fdimd1_purecfma(double, double); +IMPORT CONST double Sleef_truncd1_purecfma(double); +IMPORT CONST double Sleef_finz_truncd1_purecfma(double); +IMPORT CONST double Sleef_floord1_purecfma(double); +IMPORT CONST double Sleef_finz_floord1_purecfma(double); +IMPORT CONST double Sleef_ceild1_purecfma(double); +IMPORT CONST double Sleef_finz_ceild1_purecfma(double); +IMPORT CONST double Sleef_roundd1_purecfma(double); +IMPORT CONST double Sleef_finz_roundd1_purecfma(double); +IMPORT CONST double Sleef_rintd1_purecfma(double); +IMPORT CONST double Sleef_finz_rintd1_purecfma(double); +IMPORT CONST double Sleef_nextafterd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_nextafterd1_purecfma(double, double); +IMPORT CONST double 
Sleef_frfrexpd1_purecfma(double); +IMPORT CONST double Sleef_finz_frfrexpd1_purecfma(double); +IMPORT CONST int32_t Sleef_expfrexpd1_purecfma(double); +IMPORT CONST int32_t Sleef_finz_expfrexpd1_purecfma(double); +IMPORT CONST double Sleef_fmodd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fmodd1_purecfma(double, double); +IMPORT CONST double Sleef_remainderd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_remainderd1_purecfma(double, double); +IMPORT CONST Sleef_double_2 Sleef_modfd1_purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_modfd1_purecfma(double); +IMPORT CONST double Sleef_lgammad1_u10purecfma(double); +IMPORT CONST double Sleef_finz_lgammad1_u10purecfma(double); +IMPORT CONST double Sleef_tgammad1_u10purecfma(double); +IMPORT CONST double Sleef_finz_tgammad1_u10purecfma(double); +IMPORT CONST double Sleef_erfd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_erfd1_u10purecfma(double); +IMPORT CONST double Sleef_erfcd1_u15purecfma(double); +IMPORT CONST double Sleef_finz_erfcd1_u15purecfma(double); +IMPORT CONST int Sleef_getIntd1_purecfma(int); +IMPORT CONST void *Sleef_getPtrd1_purecfma(int); + +#ifndef Sleef_float_2_DEFINED +typedef struct { + float x, y; +} Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +IMPORT CONST float Sleef_sinf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_sinf1_u35purecfma(float); +IMPORT CONST float Sleef_cosf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_cosf1_u35purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincosf1_u35purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincosf1_u35purecfma(float); +IMPORT CONST float Sleef_tanf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_tanf1_u35purecfma(float); +IMPORT CONST float Sleef_asinf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_asinf1_u35purecfma(float); +IMPORT CONST float Sleef_acosf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_acosf1_u35purecfma(float); +IMPORT CONST float 
Sleef_atanf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_atanf1_u35purecfma(float); +IMPORT CONST float Sleef_atan2f1_u35purecfma(float, float); +IMPORT CONST float Sleef_finz_atan2f1_u35purecfma(float, float); +IMPORT CONST float Sleef_logf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_logf1_u35purecfma(float); +IMPORT CONST float Sleef_cbrtf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_cbrtf1_u35purecfma(float); +IMPORT CONST float Sleef_sinf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_sinf1_u10purecfma(float); +IMPORT CONST float Sleef_cosf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_cosf1_u10purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincosf1_u10purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincosf1_u10purecfma(float); +IMPORT CONST float Sleef_tanf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_tanf1_u10purecfma(float); +IMPORT CONST float Sleef_asinf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_asinf1_u10purecfma(float); +IMPORT CONST float Sleef_acosf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_acosf1_u10purecfma(float); +IMPORT CONST float Sleef_atanf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_atanf1_u10purecfma(float); +IMPORT CONST float Sleef_atan2f1_u10purecfma(float, float); +IMPORT CONST float Sleef_finz_atan2f1_u10purecfma(float, float); +IMPORT CONST float Sleef_logf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_logf1_u10purecfma(float); +IMPORT CONST float Sleef_cbrtf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_cbrtf1_u10purecfma(float); +IMPORT CONST float Sleef_expf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_expf1_u10purecfma(float); +IMPORT CONST float Sleef_powf1_u10purecfma(float, float); +IMPORT CONST float Sleef_finz_powf1_u10purecfma(float, float); +IMPORT CONST float Sleef_sinhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_sinhf1_u10purecfma(float); +IMPORT CONST float Sleef_coshf1_u10purecfma(float); +IMPORT CONST float 
Sleef_finz_coshf1_u10purecfma(float); +IMPORT CONST float Sleef_tanhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_tanhf1_u10purecfma(float); +IMPORT CONST float Sleef_sinhf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_sinhf1_u35purecfma(float); +IMPORT CONST float Sleef_coshf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_coshf1_u35purecfma(float); +IMPORT CONST float Sleef_tanhf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_tanhf1_u35purecfma(float); +IMPORT CONST float Sleef_fastsinf1_u3500purecfma(float); +IMPORT CONST float Sleef_finz_fastsinf1_u3500purecfma(float); +IMPORT CONST float Sleef_fastcosf1_u3500purecfma(float); +IMPORT CONST float Sleef_finz_fastcosf1_u3500purecfma(float); +IMPORT CONST float Sleef_fastpowf1_u3500purecfma(float, float); +IMPORT CONST float Sleef_finz_fastpowf1_u3500purecfma(float, float); +IMPORT CONST float Sleef_asinhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_asinhf1_u10purecfma(float); +IMPORT CONST float Sleef_acoshf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_acoshf1_u10purecfma(float); +IMPORT CONST float Sleef_atanhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_atanhf1_u10purecfma(float); +IMPORT CONST float Sleef_exp2f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_exp2f1_u10purecfma(float); +IMPORT CONST float Sleef_exp2f1_u35purecfma(float); +IMPORT CONST float Sleef_finz_exp2f1_u35purecfma(float); +IMPORT CONST float Sleef_exp10f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_exp10f1_u10purecfma(float); +IMPORT CONST float Sleef_exp10f1_u35purecfma(float); +IMPORT CONST float Sleef_finz_exp10f1_u35purecfma(float); +IMPORT CONST float Sleef_expm1f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_expm1f1_u10purecfma(float); +IMPORT CONST float Sleef_log10f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_log10f1_u10purecfma(float); +IMPORT CONST float Sleef_log2f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_log2f1_u10purecfma(float); +IMPORT CONST 
float Sleef_log2f1_u35purecfma(float); +IMPORT CONST float Sleef_finz_log2f1_u35purecfma(float); +IMPORT CONST float Sleef_log1pf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_log1pf1_u10purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u05purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincospif1_u05purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u35purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincospif1_u35purecfma(float); +IMPORT CONST float Sleef_sinpif1_u05purecfma(float); +IMPORT CONST float Sleef_finz_sinpif1_u05purecfma(float); +IMPORT CONST float Sleef_cospif1_u05purecfma(float); +IMPORT CONST float Sleef_finz_cospif1_u05purecfma(float); +IMPORT CONST float Sleef_fmaf1_purecfma(float, float, float); +IMPORT CONST float Sleef_finz_fmaf1_purecfma(float, float, float); +IMPORT CONST float Sleef_sqrtf1_purecfma(float); +IMPORT CONST float Sleef_finz_sqrtf1_purecfma(float); +IMPORT CONST float Sleef_sqrtf1_u05purecfma(float); +IMPORT CONST float Sleef_finz_sqrtf1_u05purecfma(float); +IMPORT CONST float Sleef_sqrtf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_sqrtf1_u35purecfma(float); +IMPORT CONST float Sleef_hypotf1_u05purecfma(float, float); +IMPORT CONST float Sleef_finz_hypotf1_u05purecfma(float, float); +IMPORT CONST float Sleef_hypotf1_u35purecfma(float, float); +IMPORT CONST float Sleef_finz_hypotf1_u35purecfma(float, float); +IMPORT CONST float Sleef_fabsf1_purecfma(float); +IMPORT CONST float Sleef_finz_fabsf1_purecfma(float); +IMPORT CONST float Sleef_copysignf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_copysignf1_purecfma(float, float); +IMPORT CONST float Sleef_fmaxf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fmaxf1_purecfma(float, float); +IMPORT CONST float Sleef_fminf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fminf1_purecfma(float, float); +IMPORT CONST float Sleef_fdimf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fdimf1_purecfma(float, 
float); +IMPORT CONST float Sleef_truncf1_purecfma(float); +IMPORT CONST float Sleef_finz_truncf1_purecfma(float); +IMPORT CONST float Sleef_floorf1_purecfma(float); +IMPORT CONST float Sleef_finz_floorf1_purecfma(float); +IMPORT CONST float Sleef_ceilf1_purecfma(float); +IMPORT CONST float Sleef_finz_ceilf1_purecfma(float); +IMPORT CONST float Sleef_roundf1_purecfma(float); +IMPORT CONST float Sleef_finz_roundf1_purecfma(float); +IMPORT CONST float Sleef_rintf1_purecfma(float); +IMPORT CONST float Sleef_finz_rintf1_purecfma(float); +IMPORT CONST float Sleef_nextafterf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_nextafterf1_purecfma(float, float); +IMPORT CONST float Sleef_frfrexpf1_purecfma(float); +IMPORT CONST float Sleef_finz_frfrexpf1_purecfma(float); +IMPORT CONST float Sleef_fmodf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fmodf1_purecfma(float, float); +IMPORT CONST float Sleef_remainderf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_remainderf1_purecfma(float, float); +IMPORT CONST Sleef_float_2 Sleef_modff1_purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_modff1_purecfma(float); +IMPORT CONST float Sleef_lgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_lgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_tgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_tgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_erff1_u10purecfma(float); +IMPORT CONST float Sleef_finz_erff1_u10purecfma(float); +IMPORT CONST float Sleef_erfcf1_u15purecfma(float); +IMPORT CONST float Sleef_finz_erfcf1_u15purecfma(float); +IMPORT CONST int Sleef_getIntf1_purecfma(int); +IMPORT CONST int Sleef_finz_getIntf1_purecfma(int); +IMPORT CONST void *Sleef_getPtrf1_purecfma(int); +IMPORT CONST void *Sleef_finz_getPtrf1_purecfma(int); +#endif +#ifdef __cplusplus +} +#endif + +#undef IMPORT +#endif // #ifndef __SLEEF_H__