diff --git a/.gitattributes b/.gitattributes index 0b10ef2aa75c0640f04a077b4f6b8df5dea6fab5..cc09dcb315e618ca061005ac6cab8c4251372fbf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -142,3 +142,6 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/simd.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/halide.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a49ebeb21efcfc629f6282c3b21664840e7337d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee19b8e0b980d0895a7af50aa7c3244d133ce110a196485ab8cec5fa7b9767d4 +size 121452 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e2211e38b1ee8a071bceec78156e3a646d70f07 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8596ce3d305b9ea76fd93737e3fda25769b1901142db9efff0fde9757b03517 +size 262897 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e326f6006cfa958bb3f6b92bc77d5d59ee49fc8e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de158b207dec0ef6dd7cca5acc1db68fcc605b0046ed6c5ffcf0d9b8f34d3b82 +size 138985 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ae13108b6a38ad8b359e6666ff1318f593095e3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..dc384dccfb91fb6dcf38b5fc9148272c38d44701 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..733fc8d27968b2a245b81554b03297e3be206091 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02f8ba3fd51a84088eccaf74cc025824a721a4dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb7e59e175f2b51b5089030a60d99c13865c8bfc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67cedd2943e258946fba3ffe16a6381d81df2a9a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05416037695c63d94dc0be2b450bd696ca1f1ee8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2596968b3d2336bfb58eae6b7bbec8e410c4bec Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab211df50445bd0e0ae8d1e704ec137f1d39512 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ea534f93f2fb94e2719bc25fd65001de2d09f31 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f50fa4ac65efa3635e534ad0ba84bee210897bc3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2549dca51ec88e891ca03d2c3ad808d86334e9f7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93563fe19207e92a05c8992c031ff31808c0158d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60083539a8f660f702dfc111d70273d5b63d0c8a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67c923407c6cfc76576838be92e269d7527b18e6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f9ddd9680001b5937b43eee6cf210d56ca98861 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58c58ac4cd4ba85c625ddb8545282d060c0106c9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c925ffc945d520c500efe94a066f8d341ff7dab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a00dacc4d679317e150976db86f9c417f191c111 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ce7d9590488e2ff848fc99109754b9f53543de Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b17d76e12794b8407063c84d5dbb55b3aac25c99 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py @@ -0,0 +1 @@ +from . 
import mm, mm_common, mm_plus_mm, unpack_mixed_mm diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95eafd715ddf717597e38acac78c24bb1addc141 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..439ca3179d3eb0c7bd183c4b805eef127134005e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a83ec5ca0cfc13e7972e784f63bb57cb038ecb3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dfd0f99df0f6f8d7c1045d0e14a8726129e7439 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..860f94cffcb0b46f249f15f029d93e61198403b1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31abf219caacd182d3500f9e6f0284521514a883 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..741b3768a6fd15a6efbd1aaae1b5dbf61acac946 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93637ad8c893b8d84a23f46cf37219b4b618a4b6 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f28d584150079a847127f642f02a6a478c612f17 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9744724e3a503795e13cc2b0aaec8668017fafd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py new file mode 100644 index 0000000000000000000000000000000000000000..428ff06d4a052fa38fec17df8ad7a1d6591b507c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py @@ -0,0 +1,192 @@ +# mypy: allow-untyped-defs +import logging + +import torch + +from .. import ir, lowering as L +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ( + ceildiv as cdiv, + use_aten_gemm_kernels, + use_cutlass_template, + use_triton_template, +) +from ..virtualized import V +from .mm import _is_static_problem +from .mm_common import addmm_epilogue, mm_args, mm_configs, mm_options + + +log = logging.getLogger(__name__) +aten = torch.ops.aten + + +def bmm_grid(b, m, n, meta): + return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), b, 1) + + +bmm_template = TritonTemplate( + name="bmm", + grid=bmm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", -2)}} + N = {{size("B", -1)}} + K = {{size("A", -1)}} + + stride_aq = {{stride("A", 0)}} + stride_am = {{stride("A", 1)}} + stride_ak = {{stride("A", 2)}} + + stride_bq = {{stride("B", 0)}} + stride_bk = {{stride("B", 1)}} + stride_bn = {{stride("B", 2)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + + rk = tl.arange(0, BLOCK_K) + + idx_q = tl.program_id(1) # batch dimension for BMM + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak + idx_q*stride_aq) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn + idx_q*stride_bq) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if 
EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_q = tl.program_id(1) # batch dimension for BMM + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_q", "idx_m", "idx_n"), "acc", "mask")}} +""", +) + +aten_bmm = ExternKernelChoice(torch.bmm, "at::bmm_out") +aten_baddbmm = ExternKernelChoice(torch.baddbmm, "at::baddbmm_out") + + +@L.register_lowering(aten.bmm) +def tuned_bmm(mat1, mat2, *, layout=None): + if all(x.get_device().type == "cpu" for x in [mat1, mat2]): + # decompose to small ops when memory bound + if mat1.get_size()[1] == 1 or mat2.get_size()[2] == 1: + mat1 = L.unsqueeze(mat1, -1) + mat2 = L.unsqueeze(mat2, 1) + return L.sum_(L.mul(mat1, mat2), axis=2) + + def is_valid_to_require_contiguous(t): + if not ir.is_storage_and_layout(t): + return True + _, layout = ir.as_storage_and_layout(t, freeze=False) + return isinstance(layout, ir.FlexibleLayout) + + def is_preferred_layout_as_bmm_input(sizes, strides): + # contiguous on one of the last two dims + return ( + strides[-1] == 1 and (sizes[-2] == 1 or strides[-2] >= sizes[-1]) + ) or (strides[-2] == 1 and (sizes[-1] == 1 or strides[-1] >= sizes[-2])) + + # Make the input of bmm contiguous + # if it is not contiguous on either of the last two dims, + # because bmm cpu implementation would do contiguous() if not. + # This is to avoid additional copies in bmm. + def may_require_contiguous(t, meta_t): + sizes = meta_t.meta["val"].size() + strides = meta_t.meta["val"].stride() + if not is_preferred_layout_as_bmm_input(sizes, strides): + t = ir.ExternKernel.require_contiguous(t) + return t + + if is_valid_to_require_contiguous(mat1): + meta_mat1 = V.graph.current_node.args[0] + mat1 = may_require_contiguous(mat1, meta_mat1) + if is_valid_to_require_contiguous(mat2): + meta_mat2 = V.graph.current_node.args[1] + mat2 = may_require_contiguous(mat2, meta_mat2) + + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + + # options to tune from + choices = [aten_bmm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else [] + if use_triton_template(layout): + for config in mm_configs(m, n, k): + bmm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout) + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + from ..codegen.cuda.gemm_template import CUTLASS3xGemmTemplate + + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2]) + + if len(choices) == 0: + log.warning("No choices for GEMM, using ATen backend as fallback") + choices.append(aten_bmm.bind((mat1, mat2), layout)) + + return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout) + + +# Don't register this since it is slower than decomposing it +# @L.register_lowering(aten.baddbmm) +def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): + m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout) + + # options to tune from + choices = ( + [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)] + if 
use_aten_gemm_kernels() + else [] + ) + if use_triton_template(layout): + for config in mm_configs(m, n, k): + bmm_template.maybe_append_choice( + choices, + input_nodes=(inp, mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + prefix_args=1, + epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta), + ) + + return autotune_select_algorithm("baddbmm", choices, [inp, mat1, mat2], layout) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..b69143fe0301545c53557886ed5701674e57565a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py @@ -0,0 +1,679 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +from __future__ import annotations + +import functools +import logging +from typing import cast, List, Optional, Sequence, Tuple, TYPE_CHECKING, TypedDict + +import torch + +from .. import config, ir +from ..lowering import ( + add_layout_constraint, + constrain_to_fx_strides, + lowerings as L, + register_lowering, +) +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ( + ceildiv, + is_ones, + is_zeros, + pad_listlike, + sympy_product, + use_triton_template, +) +from ..virtualized import V +from .mm_common import filtered_configs + + +if TYPE_CHECKING: + from ..ir import TensorBox + +log = logging.getLogger(__name__) + + +aten = torch.ops.aten + + +def conv2d_grid(n, c, h, w, meta): + return ( + ceildiv(n * h * w, meta["BLOCK_M"]), + ceildiv(c, meta["BLOCK_N"]), + meta["GROUPS"], + ) + + +def conv3d_grid(n, c, d, h, w, meta): + return ( + ceildiv(n * d * h * w, meta["BLOCK_M"]), + ceildiv(c, meta["BLOCK_N"]), + meta["GROUPS"], + ) + + +# List of dictionaries to store the kernel configs. 
Configs that evaluate to true +# will be utilised on the target platform +kernel_configs = [ + # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps" + {"config": (64, 256, 16, 2, 4), "cond": True}, + {"config": (256, 64, 16, 2, 4), "cond": True}, + {"config": (1024, 16, 16, 1, 8), "cond": True}, + {"config": (128, 128, 32, 2, 8), "cond": True}, + {"config": (64, 64, 32, 2, 4), "cond": True}, + {"config": (64, 256, 32, 2, 8), "cond": True}, + {"config": (256, 64, 32, 2, 8), "cond": True}, +] + +# Create filtered list of configs based on conv +platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in kernel_configs + if config["cond"] +) + +# On ROCm convert num_stages to 1 as pipelining provides no benefit +if torch.version.hip: + platform_configs = tuple( + (config[0], config[1], config[2], 1, config[4]) for config in platform_configs + ) + +conv_configs = functools.partial( + filtered_configs, + configs=platform_configs, +) + +LOOP_BODY_2D = """ + idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H + idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W + idx_x_c = tl.arange(0, BLOCK_K) + k + + x_ptrs = x_base + ( + (idx_x_h * stride_xh)[:, None] + + (idx_x_w * stride_xw)[:, None] + + (idx_x_c * stride_xc)[None, :] + ) + mask_x = ( + (idx_n < BATCH)[:, None] + & (idx_x_h >= 0)[:, None] + & (idx_x_h < IN_H)[:, None] + & (idx_x_w >= 0)[:, None] + & (idx_x_w < IN_W)[:, None] + & (idx_x_c < GROUP_IN_C)[None, :] + ) + matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) + + w_ptrs = w_base + ( + (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) + ) + mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) + matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) +""" + +""" +This is a relatively simple conv implementation that can likely be +improved. 
Many alternate conv versions can be found here: +https://github.com/pytorch/torchdynamo/pull/971 +""" +conv2d_template = TritonTemplate( + name="convolution2d", + grid=conv2d_grid, + source=r""" +{{def_kernel("X", "W")}} + # Tensor dimensions + BATCH = {{size("X", 0)}} + IN_C = {{size("X", 1)}} + IN_H = {{size("X", 2)}} + IN_W = {{size("X", 3)}} + OUT_C = {{size(None, 1)}} + OUT_H = {{size(None, 2)}} + OUT_W = {{size(None, 3)}} + + # Strides: + stride_xn = {{stride("X", 0)}} + stride_xc = {{stride("X", 1)}} + stride_xh = {{stride("X", 2)}} + stride_xw = {{stride("X", 3)}} + stride_wc_out = {{stride("W", 0)}} + stride_wc_in = {{stride("W", 1)}} + stride_wh = {{stride("W", 2)}} + stride_ww = {{stride("W", 3)}} + + nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + idx_y_w = nhw % OUT_W + nh = nhw // OUT_W + idx_y_h = nh % OUT_H + idx_n = nh // OUT_H + idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + +{% if GROUPS == 1 %} + group = 0 + GROUP_IN_C = IN_C + GROUP_OUT_C = OUT_C +{% else %} + group = tl.program_id(2) + GROUP_IN_C = IN_C // GROUPS + GROUP_OUT_C = OUT_C // GROUPS +{% endif %} + + x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] + w_base = ( + W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + +{% if UNROLL %} +{% for i in range(KERNEL_H) %} +{% for j in range(KERNEL_W) %} + i = {{i}} + j = {{j}} + for k in range(0, GROUP_IN_C, BLOCK_K): + """ + + LOOP_BODY_2D + + """ +{% endfor %} +{% endfor %} +{% else %} + # Could be simplified, but slightly slower: + # for i in range(KERNEL_H): + # for j in range(KERNEL_W): + # for k in range(0, GROUP_IN_C, BLOCK_K): + BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K + for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): + k = (ijk % BLOCK_K_COUNT) * BLOCK_K + ij = ijk // BLOCK_K_COUNT + i = ij // KERNEL_W + j = ij % KERNEL_W + """ + + LOOP_BODY_2D + + """ +{% endif %} + + mask = ( + (idx_n < BATCH)[:, None] + & (idx_y_h < OUT_H)[:, None] + & (idx_y_w < OUT_W)[:, None] + & (idx_y_c < GROUP_OUT_C)[None, :] + ) + idx_n = idx_n[:, None] + idx_c = idx_y_c[None, :] + group * GROUP_OUT_C + idx_h = idx_y_h[:, None] + idx_w = idx_y_w[:, None] + + # inductor generates a suffix + {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}} +""", +) + +LOOP_BODY_3D = """ + idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D + idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H + idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W + idx_x_c = tl.arange(0, BLOCK_K) + k + + x_ptrs = x_base + ( + (idx_x_d * stride_xd)[:, None] + + (idx_x_h * stride_xh)[:, None] + + (idx_x_w * stride_xw)[:, None] + + (idx_x_c * stride_xc)[None, :] + ) + mask_x = ( + (idx_n < BATCH)[:, None] + & (idx_x_d >= 0)[:, None] + & (idx_x_d < IN_D)[:, None] + & (idx_x_h >= 0)[:, None] + & (idx_x_h < IN_H)[:, None] + & (idx_x_w >= 0)[:, None] + & (idx_x_w < IN_W)[:, None] + & (idx_x_c < GROUP_IN_C)[None, :] + ) + matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) + + w_ptrs = w_base + ( + (idx_x_c * stride_wc_in)[:, None] + + (d * stride_wd) + (i * stride_wh) + (j * stride_ww) + ) + mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) + matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) +""" + +conv3d_template = TritonTemplate( + name="convolution3d", + grid=conv3d_grid, + source=r""" +{{def_kernel("X", "W")}} + # Tensor dimensions + BATCH = {{size("X", 0)}} + IN_C = {{size("X", 
1)}} + IN_D = {{size("X", 2)}} + IN_H = {{size("X", 3)}} + IN_W = {{size("X", 4)}} + OUT_C = {{size(None, 1)}} + OUT_D = {{size(None, 2)}} + OUT_H = {{size(None, 3)}} + OUT_W = {{size(None, 4)}} + + # Strides: + stride_xn = {{stride("X", 0)}} + stride_xc = {{stride("X", 1)}} + stride_xd = {{stride("X", 2)}} + stride_xh = {{stride("X", 3)}} + stride_xw = {{stride("X", 4)}} + stride_wc_out = {{stride("W", 0)}} + stride_wc_in = {{stride("W", 1)}} + stride_wd = {{stride("W", 2)}} + stride_wh = {{stride("W", 3)}} + stride_ww = {{stride("W", 4)}} + + ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + idx_y_w = ndhw % OUT_W + ndh = ndhw // OUT_W + idx_y_h = ndh % OUT_H + nd = ndh // OUT_H + idx_y_d = nd % OUT_D + idx_n = nd // OUT_D + idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + +{% if GROUPS == 1 %} + group = 0 + GROUP_IN_C = IN_C + GROUP_OUT_C = OUT_C +{% else %} + group = tl.program_id(2) + GROUP_IN_C = IN_C // GROUPS + GROUP_OUT_C = OUT_C // GROUPS +{% endif %} + + x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] + w_base = ( + W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + +{% if UNROLL %} +{% for d in range(KERNEL_D) %} +{% for i in range(KERNEL_H) %} +{% for j in range(KERNEL_W) %} + d = {{d}} + i = {{i}} + j = {{j}} + for k in range(0, GROUP_IN_C, BLOCK_K): + """ + + LOOP_BODY_3D + + """ +{% endfor %} +{% endfor %} +{% endfor %} +{% else %} + # Could be simplified, but slightly slower: + # for d in range(KERNEL_D): + # for i in range(KERNEL_H): + # for j in range(KERNEL_W): + # for k in range(0, GROUP_IN_C, BLOCK_K): + BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K + for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT): + k = (dijk % BLOCK_K_COUNT) * BLOCK_K + dij = dijk // BLOCK_K_COUNT + j = dij % KERNEL_W + di = dij // KERNEL_W + i = di % KERNEL_H + d = di // KERNEL_H + """ + + LOOP_BODY_3D + + """ +{% endif %} + + mask = ( + (idx_n < BATCH)[:, None] + & (idx_y_d < OUT_D)[:, None] + & (idx_y_h < OUT_H)[:, None] + & (idx_y_w < OUT_W)[:, None] + & (idx_y_c < GROUP_OUT_C)[None, :] + ) + idx_n = idx_n[:, None] + idx_c = idx_y_c[None, :] + group * GROUP_OUT_C + idx_d = idx_y_d[:, None] + idx_h = idx_y_h[:, None] + idx_w = idx_y_w[:, None] + + # inductor generates a suffix + {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}} +""", +) + +aten_convolution = ExternKernelChoice( + torch.convolution, + "at::convolution", + has_out_variant=False, + op_overload=aten.convolution.default, +) + + +def conv1x1_via_mm(x, w, *, out): + w = torch.squeeze(torch.squeeze(w, -1), -1) + return torch.matmul( + x.permute(0, 2, 3, 1), w.permute(1, 0), out=out.permute(0, 2, 3, 1) + ) + + +aten_conv1x1_via_mm = ExternKernelChoice(conv1x1_via_mm, None) + + +class ConvLayoutParams(TypedDict): + stride: tuple[int, ...] + padding: tuple[int, ...] + dilation: tuple[int, ...] + transposed: bool + output_padding: tuple[int, ...] 
+ groups: int + + +def conv_layout( + x: TensorBox, + weight: TensorBox, + bias: Optional[TensorBox], + stride: Sequence[int], + padding: tuple[int, ...], + dilation: tuple[int, ...], + transposed: bool, + output_padding: tuple[int, ...], + groups: int, +) -> ir.Layout: + """Determine output layout for a convolution""" + with V.graph.fake_mode: + output = torch.ops.aten.convolution( + ir.ir_node_to_tensor(x, guard_shape=True), + ir.ir_node_to_tensor(weight, guard_shape=True), + ir.ir_node_to_tensor(bias, guard_shape=True), + V.graph.sizevars.size_hints(stride), # type: ignore[arg-type] + V.graph.sizevars.size_hints(padding), # type: ignore[arg-type] + V.graph.sizevars.size_hints(dilation), # type: ignore[arg-type] + transposed, + V.graph.sizevars.size_hints(output_padding), # type: ignore[arg-type] + groups, + ) + sizes = ir.convert_shape_to_inductor(output.size()) + stride = ir.convert_shape_to_inductor(output.stride()) # type: ignore[assignment] + + return ir.FixedLayout( + x.get_device(), + x.get_dtype(), + sizes, + stride, + ) + + +def channels_last_order(rank): + order = list(reversed(range(rank))) + order.insert(1, order.pop(-1)) + return order + + +def convert_1x1_conv_to_mm(x, weight, bias): + # special case for 1x1 convolution, which is actually just a matmul + rank = len(weight.get_size()) + for _ in range(rank - 2): + weight = L[aten.squeeze](weight, dim=-1) + weight = L[aten.permute](weight, [1, 0]) + + x = ir.ExternKernel.require_stride_order(x, channels_last_order(rank)) + x_permute = list(range(rank)) + x_permute.append(x_permute.pop(1)) + x = L[aten.permute](x, x_permute) + *sizes, in_chan = x.get_size() + x = L[aten.reshape](x, [sympy_product(sizes), in_chan]) + if bias is None: + result = L[aten.mm](x, weight) + else: + result = L[aten.addmm](bias, x, weight) + result = L[aten.reshape](result, [*sizes, -1]) + result_permute = list(range(rank)) + result_permute.insert(1, result_permute.pop(-1)) + return L[aten.permute](result, result_permute) + + +@register_lowering(aten.convolution) +def convolution( + x: TensorBox, + weight: TensorBox, + bias: TensorBox, + stride: List[int], + padding: List[int], + dilation: List[int], + transposed: bool, + output_padding: List[int], + groups: int, +): + stride = tuple(stride) + padding = tuple(padding) + dilation = tuple(dilation) + output_padding = tuple(output_padding) + if not isinstance(groups, int): + groups = V.graph.sizevars.evaluate_static_shape(groups) + assert isinstance(groups, int) + + # Need to use size hints for the triton template since the template does not + # work with a dynamic shape. + # + # No need to evaluate_static_shape for dilation and output_padding + # since the template is only used when dilation is 1 and output_padding + # is 0.
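+ # For example (illustrative values): a conv whose symbolic stride element has + # hint 2 is specialized to the concrete tuple (2, 2) below, so the template + # sees plain Python ints rather than SymInts.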
+ stride = tuple(V.graph.sizevars.evaluate_static_shapes(stride)) + padding = tuple(V.graph.sizevars.evaluate_static_shapes(padding)) + + kwargs: ConvLayoutParams = { + "stride": stride, + "padding": padding, + "dilation": dilation, + "transposed": transposed, + "output_padding": output_padding, + "groups": groups, + } + + if len(x.get_size()) == len(weight.get_size()) - 1: + # add batch dimension to simplify rest of function + return L[aten.squeeze]( + convolution(L[aten.expand](x, [1, *x.get_size()]), weight, bias, **kwargs), + dim=0, + ) + + out_chan, in_chan, *kernel_shape = V.graph.sizevars.evaluate_static_shapes( + weight.get_size() + ) + ndim = len(kernel_shape) + stride = pad_listlike(stride, ndim) + padding = pad_listlike(padding, ndim) + dilation = pad_listlike(dilation, ndim) + output_padding = pad_listlike(output_padding, ndim) + + def channels_last_conv(): + if V.graph.layout_opt and ndim == 2: + return True + + layout = conv_layout(x, weight, None, **kwargs) + req_stride_order = ir.get_stride_order( + V.graph.sizevars.size_hints(layout.stride) + ) + return req_stride_order == ir.NHWC_STRIDE_ORDER + + autotuning_gemm = config.max_autotune or config.max_autotune_gemm + + if ( + (config.conv_1x1_as_mm or (autotuning_gemm and channels_last_conv())) + and is_ones(kernel_shape) + and is_ones(stride) + and is_zeros(padding) + and is_ones(dilation) + and not transposed + and is_zeros(output_padding) + and groups == 1 + and V.graph.sizevars.statically_known_gt(sympy_product(x.get_size()), 0) + ): + return convert_1x1_conv_to_mm(x, weight, bias) + + if bias is not None and ir.get_device_type(x) != "cpu": + # peel off the bias, cudnn is slower with it + result = convolution(x, weight, None, **kwargs) + return L[aten.add]( + result, L[aten.view](bias, [result.get_size()[1]] + ndim * [1]) + ) + + x.realize() + weight.realize() + + # ndim can be 1 for convolution in models such as demucs + # TODO: check if it's beneficial to convert Conv1d to Conv2d and then + # apply channels last. + if V.graph.layout_opt and ndim == 2: + V.graph.num_channels_last_conv += 1 + x = ir.ExternKernel.require_channels_last(x) + # TODO maybe we can convert weights to channels last just once before + # running the model. + weight = ir.ExternKernel.require_channels_last(weight) + layout = conv_layout(x, weight, None, **kwargs) + else: + layout = conv_layout(x, weight, None, **kwargs) + req_stride_order = ir.get_stride_order( + V.graph.sizevars.size_hints(layout.stride) + ) + x = ir.ExternKernel.require_stride_order(x, req_stride_order) + weight = ir.ExternKernel.require_stride_order(weight, req_stride_order) + + ordered_kwargs_for_cpp_kernel = [ + "stride", + "padding", + "dilation", + "transposed", + "output_padding", + "groups", + ] + if bias is None: + args = [x, weight] + kwargs["bias"] = None # type: ignore[typeddict-unknown-key] + ordered_kwargs_for_cpp_kernel.insert(0, "bias") + else: + args = [x, weight, bias] + bias.realize() + bias.freeze_layout() + V.graph.sizevars.evaluate_static_shapes(bias.get_size()) + + choices = [] + if torch._inductor.utils._use_conv_autotune_backend("ATEN"): + choices = [ + aten_convolution.bind( + args, + layout, + ordered_kwargs_for_cpp_kernel, + **kwargs, + ) + ] + + if ( + torch._inductor.utils._use_conv_autotune_backend("TRITON") + and use_triton_template(layout) + # templates only support these: + and is_ones(dilation) + and not transposed + and is_zeros(output_padding) + # there are some odd models where this check fails (e.g. 
shufflenet_v2_x1_0) + and V.graph.sizevars.statically_known_equals(in_chan, x.get_size()[1]) # type: ignore[arg-type] + ): + if ( + is_ones(kernel_shape) + and is_ones(stride) + and is_zeros(padding) + and groups == 1 + ): + choices.append(aten_conv1x1_via_mm.bind(args, layout)) + + for cfg in conv_configs( + sympy_product([x.get_size()[0], *x.get_size()[2:]]), + out_chan, + in_chan, + ): + if ndim == 2: + conv2d_template.maybe_append_choice( + choices, + input_nodes=(x, weight), + layout=layout, + KERNEL_H=kernel_shape[0], + KERNEL_W=kernel_shape[1], + STRIDE_H=stride[0], + STRIDE_W=stride[1], + PADDING_H=padding[0], + PADDING_W=padding[1], + GROUPS=groups, + # TODO(jansel): try unroll for bigger kernels once fixed: + # https://github.com/openai/triton/issues/1254 + UNROLL=is_ones(kernel_shape), + ALLOW_TF32=torch.backends.cudnn.allow_tf32, + num_stages=cfg.num_stages, + num_warps=cfg.num_warps, + **cfg.kwargs, + ) + elif ndim == 3: + conv3d_template.maybe_append_choice( + choices, + input_nodes=(x, weight), + layout=layout, + KERNEL_D=kernel_shape[0], + KERNEL_H=kernel_shape[1], + KERNEL_W=kernel_shape[2], + STRIDE_D=stride[0], + STRIDE_H=stride[1], + STRIDE_W=stride[2], + PADDING_D=padding[0], + PADDING_H=padding[1], + PADDING_W=padding[2], + GROUPS=groups, + # TODO(jansel): try unroll for bigger kernels once fixed: + # https://github.com/openai/triton/issues/1254 + UNROLL=is_ones(kernel_shape), + ALLOW_TF32=torch.backends.cudnn.allow_tf32, + num_stages=cfg.num_stages, + num_warps=cfg.num_warps, + **cfg.kwargs, + ) + + return autotune_select_algorithm("convolution", choices, args, layout) + + +@register_lowering(aten._convolution) +def _convolution( + x, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + benchmark, + deterministic, + cudnn_enabled, + allow_tf32, +): + return convolution( + x, weight, bias, stride, padding, dilation, transposed, output_padding, groups + ) + + +def constrain_conv_to_fx_strides(fx_node, *args, **kwargs): + assert fx_node.target == torch.ops.aten.convolution.default + if V.graph.layout_opt: + return args, kwargs + else: + return constrain_to_fx_strides(fx_node, *args, **kwargs) + + +add_layout_constraint(aten.convolution, constrain_conv_to_fx_strides) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..567c4c5fefbf8688b8f5e4b1eac247535c8edb32 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py @@ -0,0 +1,1843 @@ +# mypy: allow-untyped-defs +""" Triton Implementation of the flex_attention Kernel""" + +import logging +import math +from typing import Any, List, Optional, Sequence, Tuple + +import sympy + +import torch +from torch._inductor.virtualized import V +from torch.utils._pytree import tree_map + +from .. 
import config +from ..ir import ( + ComputedBuffer, + ExternKernel, + FixedLayout, + FlexibleLayout, + get_stride_order, + InputBuffer, + IRNode, + StorageBox, + stride_order2fill_order, + Subgraph, + TensorBox, +) +from ..lowering import empty, empty_strided, lowerings, register_lowering +from ..select_algorithm import autotune_select_algorithm, realize_inputs, TritonTemplate + + +log = logging.getLogger(__name__) +aten = torch.ops.aten +Expr = sympy.Expr + + +def construct_strides( + sizes: Sequence[int], + fill_order: Sequence[int], +) -> Sequence[int]: + """From a list of sizes and a fill order, construct the strides of the permuted tensor.""" + # Initialize strides + assert len(sizes) == len( + fill_order + ), "Length of sizes must match the length of the fill order" + strides = [0] * len(sizes) + + # Start with stride 1 for the innermost dimension + current_stride = 1 + + # Iterate through the fill order populating strides + for dim in fill_order: + strides[dim] = current_stride + current_stride *= sizes[dim] + + return strides + + +def flex_attention_grid(batch_size, q_heads, num_queries, d_model, meta): + """How is this kernel parallelized? + We create a grid of (ceil_div(n_queries, query_block_size), batch_size * num_heads, 1) + Each block is responsible for iterating over blocks of keys and values, calculating + the final attention output. + """ + import triton + + return (triton.cdiv(num_queries, meta["BLOCK_M"]), batch_size * q_heads, 1) + + +def create_placeholder( + name: str, dtype: torch.dtype, device: torch.device +) -> TensorBox: + """Creates a placeholder input buffer for producing subgraph_output.""" + input_buffer = InputBuffer(name, FixedLayout(device, dtype, [], [])) + return TensorBox.create(input_buffer) + + +def maybe_realize(args: List[Optional[IRNode]]): + """Accepts a list of optional IRNodes and returns a list of realized IRNodes""" + return tree_map(lambda x: realize_inputs(x) if x is not None else None, args) + + +def get_float32_precision(): + if torch.get_float32_matmul_precision() == "highest" or torch.version.hip: + return "'ieee'" + else: + return "'tf32'" + + +def build_subgraph_buffer( + args: List[TensorBox], + subgraph: Subgraph, +): + """This function's goal is to take in the required args and produce the subgraph buffer. + The subgraph buffer is a ComputedBuffer that will be inlined into the triton template + + Args: + args: The args that are passed into the subgraph. Contains both fixed and lifted inputs. + subgraph: The Subgraph ir for which to produce the output node + """ + cnt = 0 + env = {} + for node in subgraph.graph_module.graph.nodes: + # There are two classes of placeholder inputs that we need + # to handle differently. For the first n_scalar_inps inputs + # we expect that these placeholders were generated by the make_fx call + # in the flex attention HOP. So we need to create a new placeholder + # TensorBox for each of these inputs. For the rest of the inputs we + # expect that these are lifted inputs that fill up the '*other_buffers' + # tuple and already have corresponding TensorBoxes passed in as args.
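+ # Illustrative example (hypothetical subgraph): for a score_mod traced as + # (score, b, h, m, n) plus one captured bias tensor, the first five + # placeholders bind to the fresh scalar TensorBoxes (see create_placeholder) + # at the front of args, and the sixth binds to the lifted bias TensorBox + # that follows them.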
+ if node.op == "placeholder": + env[node] = args[cnt] + cnt += 1 + elif node.op == "call_function": + # For call_function we use the default lowerings and pass in the + # already created TensorBoxes as args + + args, kwargs = tree_map( + lambda x: env[x] if x in env else x, (node.args, node.kwargs) + ) + env[node] = lowerings[node.target](*args, **kwargs) + elif node.op == "output": + + def convert_output_node_to_buffer(output): + if output is None: + return None + output_node = output + output_buffer = env[output_node] + assert isinstance(output_buffer, TensorBox), ( + "The output node for flex attention's subgraph must be a TensorBox, but got: ", + type(output_buffer), + ) + assert isinstance(output_buffer.data, StorageBox), ( + "The output node for the flex attention subgraph must be a StorageBox, but got: ", + type(output_buffer), + ) + subgraph_buffer = ComputedBuffer( + name=None, + layout=FlexibleLayout( + device=output_buffer.data.get_device(), + dtype=output_buffer.data.get_dtype(), + size=output_buffer.data.get_size(), + ), + data=output_buffer.data.data, # type: ignore[arg-type] + ) + return subgraph_buffer + + # node.args[0] is either a single element or a list of elements + # representing all outputs of the function. + return tree_map(convert_output_node_to_buffer, node.args[0]) + + raise ValueError("FlexAttention was passed a subgraph with no output node!") + + +# Inner Triton functions shared by flex_attention & split-k decoding kernels. +compute_next_offset_func = r""" +@triton.jit +def get_offset_for_next_block(loop_iter, col_indices, total_blocks, SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK): + cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE + cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last") + next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks) + needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0 + jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK + + offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK + return offset +""" + +compute_flex_attention = r""" +{{def_kernel("Q", "K", "V", "LSE", "KV_NUM_BLKS", "KV_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX")}} + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # M: Number of queries, N: Number of keys/values, D: Model dimension + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # + # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad + # + # (Modifiable) Performance tuning options + # BLOCK_M: The thread block size across the seqlen dim of Q. + # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block. 
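+ # As a concrete illustration (sizes are hypothetical): Q_LEN=1024, BLOCK_M=128, + # Z=4, HQ=8 gives a launch grid of (cdiv(1024, 128), 4 * 8, 1) = (8, 32, 1); + # program_id(0) selects the query block and program_id(1) the (batch, head) pair.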
+ + # The below are kernel options that can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has + # about 20% more numerical error, but slightly faster. + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + + tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0) + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qk = {{stride("Q")}} + stride_kz, stride_kh, stride_kn, stride_kk = {{stride("K")}} + stride_vz, stride_vh, stride_vn, stride_vk = {{stride("V")}} + + Z = {{size("Q", 0)}} + HQ = {{size("Q", 1)}} + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + MATMUL_PRECISION = Q.dtype.element_ty + + q_start = tl.program_id(0) + off_z = tl.program_id(1) // HQ + off_hq = tl.program_id(1) % HQ + off_hkv = off_hq // GQA_SHARED_HEADS + off_g = off_hq % GQA_SHARED_HEADS + + q_offset = off_z * stride_qz + off_hq * stride_qh + k_offset = off_z * stride_kz + off_hkv * stride_kh + v_offset = off_z * stride_vz + off_hkv * stride_vh + + Q = Q + q_offset + K = K + k_offset + V = V + v_offset + + SPARSE_Z = {{size("KV_NUM_BLKS", 0)}} + SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}} + + sparse_idx_z = off_z % SPARSE_Z + sparse_idx_hq = off_hq % SPARSE_HQ + + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M) + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + + SPARSE_Q_BLOCK_CNT: tl.constexpr = tl.cdiv(Q_LEN, SPARSE_Q_BLOCK_SIZE) + SPARSE_KV_BLOCK_CNT: tl.constexpr = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE) + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M) + + # KV_IDX and KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq + sparse_kv_num_blks_offset = sparse_hz_offset * SPARSE_Q_BLOCK_CNT + q_start // SPARSE_Q_MULTIPLE + sparse_kv_idx_offset = sparse_hz_offset * SPARSE_Q_BLOCK_CNT * SPARSE_KV_BLOCK_CNT + (q_start // SPARSE_Q_MULTIPLE) * SPARSE_KV_BLOCK_CNT # noqa: B950 + + Q_block_ptr = tl.make_block_ptr( + base=Q, + shape=(Q_LEN, QK_HEAD_DIM), + strides=(stride_qm, stride_qk), + offsets=(q_start * BLOCK_M, 0), + block_shape=(BLOCK_M, QK_HEAD_DIM), + order=(1, 0) + ) + + # load q: it stays in SRAM throughout the inner loop. + if IS_DIVISIBLE: + q = tl.load(Q_block_ptr) + else: + # boundary check is not free, so we only do it when necessary. 
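+ # Only dim 0 (the query seqlen) can run past the end here: the block shape's + # second dim is QK_HEAD_DIM itself, so it never needs checking.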
+ q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option = "zero") + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We don't know anything "special" about these blocks, so we need to apply + # both score_mod and mask_mod to it + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_z, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_z, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + + # [Note] Handle fully masked out rows: + # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf. 
+ # We set Li to 1.0, which will result in lse/out = 0.0 after the log(li) + mi(0.0) step + l_i = tl.where(l_i == 0.0, 1, l_i) + + acc = acc / l_i[:, None] + idx_z = tl.program_id(1) // HQ + idx_hq = tl.program_id(1) % HQ + idx_m = offs_m[:, None] + idx_d = tl.arange(0, V_HEAD_DIM)[None, :] + + mask = idx_m < Q_LEN + # TODO generalize and add proper mask support + {{store_output(("idx_z", "idx_hq", "idx_m", "idx_d"), "acc", "mask")}} + + # TODO: don't write this out if we don't require grad + if OUTPUT_LOGSUMEXP: + off_hz = tl.program_id(1) + l_ptrs = LSE + off_hz * Q_LEN + offs_m + lse = m_i + tl.math.log2(l_i) + if IS_DIVISIBLE: + tl.store(l_ptrs, lse) + else: + tl.store(l_ptrs, lse, mask=offs_m < Q_LEN) + """ + + +compute_forward_inner = r""" +@triton.jit +def forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets used as inputs to score_mod & mask_mod + # of size [BLOCK_M, BLOCK_N] or scalar. + off_z, off_h, offs_m, offs_n, + # blocksparse data + kv_indices, kv_num_blocks, + # start kv and end kv block + block_n_start, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS, +): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + {{gen_defines() | indent_except_first(1)}} + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + RCP_LN2: tl.constexpr = 1.44269504 + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + # loop over k, v and update accumulator until block_n_end + for start_n in range(block_n_start, block_n_end): + if IS_DIVISIBLE: + acc, l_i, m_i = forward_block_mn( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + else: + # Benchmarks show that even when we apply mod & mask to every block for a non-divisible seqlen, + # it's on par with or slightly faster than applying them only to the last block in fwd. + # However, we choose a different strategy for bwd, where we only apply mod & mask + # to the last block, because doing so is much faster. + acc, l_i, m_i = forward_block_mn( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + + # update pointers + offset = get_offset_for_next_block( + start_n, kv_indices, kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N + ) + + V_block_ptr = tl.advance(V_block_ptr, (offset, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, offset)) + + offs_n = offs_n + offset + + return acc, l_i, m_i + +""" + + +compute_forward_block_mn = r""" +@triton.jit +def forward_block_mn( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, +): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + {{gen_defines() | indent_except_first(1)}} + + # -- load k -- + if IS_DIVISIBLE: + k = tl.load(K_block_ptr) + else: + k = tl.load(K_block_ptr, boundary_check=(1,), padding_option="zero") + # -- compute qk --- + qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
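+ # Change-of-base note: the kernel works in base-2 space so tl.math.exp2 can be + # used for the softmax; multiplying the scores by RCP_LN2 (= log2(e)) performs + # the change of base, since exp2(x * log2(e)) == exp(x). When PRESCALE_QK is set, + # q was already scaled by SM_SCALE * RCP_LN2 in forward_inner, so both rescales + # below are skipped.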
+ if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + # If this is the last block of a non-divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements, + # which is more than the actual number of valid elements. To avoid out-of-bounds memory accesses, + # we need to mask out the elements that fall outside of Q_LEN & KV_LEN. + m = offs_m % Q_LEN + n = offs_n % KV_LEN + else: + m = offs_m + n = offs_n + + {{ modification( + subgraph_number=0, + output_name="post_mod_scores", + score="qk", + b="off_z", + h="off_h", + m="m", + n="n", + out="qk" + ) | indent_except_first(1) }} + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for a non-divisible seqlen. + post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + {{ modification( + subgraph_number=1, + output_name="mask_mod_output", + score="qk", + b="off_z", + h="off_h", + m="m", + n="n", + ) | indent_except_first(2) }} + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partially unmasked blocks + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + + # TODO: In the case that score_mod is linear, this can be LICMed + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # -- compute scaling constant --- + m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1)) + if not ROWS_GUARANTEED_SAFE: + masked_out_rows = (m_ij == float("-inf")) + m_ij_masked = tl.where(masked_out_rows, 0, m_ij) + else: + m_ij_masked = m_ij + + alpha = tl.math.exp2(m_i - m_ij_masked) + p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None]) + + # NB: l_i update is pulled up here since it's a bit faster + # NB: For headdim=256, it's faster to move it back down to after m_i = + # m_ij + l_i = l_i * alpha + tl.sum(p, 1) + # -- scale and update acc -- + acc = acc * alpha[:, None] + + if IS_DIVISIBLE: + v = tl.load(V_block_ptr) + else: + v = tl.load(V_block_ptr, boundary_check=(0,), padding_option="zero") + acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION) + + # -- update m_i + m_i = m_ij + + return acc, l_i, m_i + +""" + + +flex_attention_template = TritonTemplate( + name="flex_attention", + grid=flex_attention_grid, + source=compute_flex_attention + + compute_forward_inner + + compute_next_offset_func + + compute_forward_block_mn, +) + + +def _use_flex_decoding(query, kernel_options): + # Decide which kernel to use; return True if we should use the flex-decoding kernel.
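+ # Heuristic (illustration): flex decoding targets short queries, e.g.
+ # autoregressive decoding with a handful of query tokens, where splitting
+ # the work across the KV sequence keeps the GPU occupied. The check below
+ # routes there when the query seqlen is provably < 128 and
+ # FORCE_USE_FLEX_ATTENTION isn't set.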
+ return ( + not kernel_options.get("FORCE_USE_FLEX_ATTENTION", False) + ) and V.graph.sizevars.evaluate_expr(sympy.Lt(query.get_size()[-2], 128)) + + +_h100_default_config = { + (torch.float32, 64): (128, 32, 4, 3), + (torch.float32, 128): (32, 64, 4, 3), + (torch.float32, 256): (32, 32, 4, 3), + (torch.bfloat16, 64): (128, 128, 4, 3), + (torch.bfloat16, 128): (128, 64, 8, 3), + (torch.bfloat16, 256): (64, 32, 4, 3), + (torch.float16, 64): (128, 128, 4, 3), + (torch.float16, 128): (128, 128, 8, 3), + (torch.float16, 256): (64, 32, 4, 3), +} + +_a100_default_config = { + (torch.float32, 64): (128, 32, 4, 3), + (torch.float32, 128): (128, 32, 4, 3), + (torch.float32, 256): (64, 16, 4, 3), + (torch.bfloat16, 64): (128, 64, 4, 3), + (torch.bfloat16, 128): (128, 64, 8, 3), + (torch.bfloat16, 256): (32, 64, 4, 3), + (torch.float16, 64): (128, 64, 4, 3), + (torch.float16, 128): (128, 64, 8, 3), + (torch.float16, 256): (32, 64, 4, 3), +} + + +def _get_default_config_fwd(query) -> Tuple[int, int, int, int]: + dtype = query.get_dtype() + head_dim = query.get_size()[-1] + default_config = None + + if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100 + if dtype == torch.float32: + default_config = (64, 64, 4, 3) + else: + default_config = (128, 64, 4, 3) + default_config = _h100_default_config.get((dtype, head_dim), default_config) + elif head_dim <= 256 and torch.cuda.get_device_capability() >= (8, 0): # A100 + if dtype == torch.float32: + default_config = (64, 64, 4, 3) + else: + default_config = (128, 64, 4, 3) + default_config = _a100_default_config.get((dtype, head_dim), default_config) + else: # modest hardware or extremely large head_dim + if dtype == torch.float32: + default_config = (32, 16, 4, 3) + else: + default_config = (64, 32, 4, 3) + + return default_config + + +def _get_default_config_bwd(query) -> Tuple[int, int, int, int]: + head_dim = query.get_size()[-1] + dtype = query.get_dtype() + + if dtype == torch.float32: + return (16, 16, 4, 1) + if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100 + if head_dim == 64: + return (64, 64, 4, 3) + elif head_dim == 128: + return (64, 128, 8, 3) + else: + return (64, 64, 4, 2) + elif torch.cuda.get_device_capability() >= (8, 0): # A100 + if head_dim == 64: + return (32, 128, 4, 3) + elif head_dim == 128: + return (64, 128, 8, 3) + else: + return (64, 64, 4, 2) + else: # modest hardware or extremely large head_dim + return (16, 16, 4, 1) + + +def create_num_blocks_fake_generator(sparse_indices): + # The idea here is that we need to create a real tensor with real data + # that's representative for benchmarking. + # For example, returning all zeros for the `kv_num_blocks` input would mean + # that we are computing 0 blocks for each row, which would provide bogus + # autotuning results. + # + # In this case, we choose to use min(16, max_block) blocks, because I + # (Horace) think it'll probably result in pretty representative performance. + # If it's too short then prefetching won't help. If it's too long then + # autotuning will take longer for no good reason. 
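+ # For instance (hypothetical shapes): with sparse_indices of shape
+ # [1, 1, 64], the fake tensor is filled with min(16, 64) == 16, i.e. every
+ # row pretends to visit 16 KV blocks while benchmarking candidate configs.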
+ def create_num_blocks_fake(x) -> torch.Tensor: + num_blocks_for_autotuning = min(16, sparse_indices.shape[-1]) + return torch.full( + x.get_size(), + int(num_blocks_for_autotuning), + dtype=x.get_dtype(), + device=x.get_device(), + ) + + return create_num_blocks_fake + + +def create_indices_fake(x) -> torch.Tensor: + indices = torch.arange( + 0, int(x.get_size()[-1]), dtype=x.get_dtype(), device=x.get_device() + ) + indices = indices.expand(x.get_size()).contiguous() + return indices + + +from torch._inductor.kernel.flex_decoding import create_flex_decoding_kernel + + +# TODO: We probably also need a layout constraint? +@register_lowering(torch.ops.higher_order.flex_attention, type_promotion_kind=None) +def flex_attention( + query, + key, + value, + subgraph, + block_mask, + scale, + kernel_options, + score_mod_other_buffers, + mask_mod_other_buffers, +): + ( + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + SPARSE_KV_BLOCK_SIZE, + SPARSE_Q_BLOCK_SIZE, + mask_graph, + ) = block_mask + placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("score", query.get_dtype()), + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + subgraph_buffer = build_subgraph_buffer( + placeholder_inps + list(score_mod_other_buffers), subgraph + ) + mask_graph_placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + mask_graph_buffer = build_subgraph_buffer( + mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph + ) + kernel_options = dict(kernel_options) + kernel_options.setdefault("FLOAT32_PRECISION", get_float32_precision()) + if _use_flex_decoding(query, kernel_options): + return create_flex_decoding_kernel( + query, + key, + value, + block_mask, + scale, + kernel_options, + subgraph_buffer, + mask_graph_buffer, + score_mod_other_buffers, + mask_mod_other_buffers, + ) + + ( + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ) = maybe_realize( + [ + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ] + ) + + Bq, Hq, seq_len_q, qk_head_dim = query.get_size() + Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size() + assert Bq == Bkv, "Batch dimension must match" + B = Bq + + if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0: + kernel_options.setdefault("IS_DIVISIBLE", False) + else: + kernel_options.setdefault("IS_DIVISIBLE", True) + + # Reuse query strides for output layout despite different last dimension. + # This works because only the last dim differs and we check it is contiguous. + q_strides = query.get_stride() + assert q_strides[-1] == 1, "Query must be contiguous in the last dimension" + + # Construct output layout with strides matching the query. 
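+ # Illustration (hypothetical sizes): a contiguous query [B, Hq, M, Dqk] has
+ # strides (Hq*M*Dqk, M*Dqk, Dqk, 1); reusing its fill order for the output
+ # size [B, Hq, M, Dv] yields strides (Hq*M*Dv, M*Dv, Dv, 1) -- the same
+ # dimension ordering, recomputed for the new head dim.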
+ out_size = [B, Hq, seq_len_q, v_head_dim] + stride_order = get_stride_order(query.get_stride()) + fill_order = stride_order2fill_order(stride_order) + out_strides = construct_strides(out_size, fill_order) + + layout = FixedLayout( + query.get_device(), + query.get_dtype(), + [B, Hq, seq_len_q, v_head_dim], + stride=out_strides, + ) + # see NOTE:[TritonTemplates with multiple outputs] + logsumexp_shape = [B, Hq, seq_len_q] + logsumexp = empty_strided( + logsumexp_shape, + None, + dtype=torch.float32, # The logsumexp is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + kernel_options.setdefault("SM_SCALE", scale) + + # Determine GQA broadcast factor. + gqa_shared_heads = Hq // Hkv + kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads) + + # Inside of Triton kernel, only apply partial masking if partial blocks are computed. + # full_kv_num_blocks is None if partial blocks are not computed + has_full_blocks = full_kv_num_blocks is not None + kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks) + if not has_full_blocks: + full_kv_num_blocks, full_kv_indices = ( + empty(0, device=query.get_device()) for _ in range(2) + ) + kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim) + kernel_options.setdefault("V_HEAD_DIM", v_head_dim) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int, int]] = [] + configs.append(_get_default_config_fwd(query)) + if config.max_autotune: + configs += [ + (128, 64, 4, 3), + (128, 128, 4, 3), + (128, 128, 8, 2), + (64, 128, 4, 3), + (64, 64, 4, 3), + ] + + # Note, we don't need to pass in the captured buffers explicitly + # because they're implicitly added by the score_mod function + # We do need to explicitly pass it in for autotuning though. + + for BLOCK_M, BLOCK_N, num_warps, num_stages in configs: + if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0 or SPARSE_Q_BLOCK_SIZE % BLOCK_M != 0: + continue + # Work around https://github.com/pytorch/pytorch/issues/129625 + if num_stages == 2: + continue + + # Performance tuning + kernel_options.setdefault("BLOCK_M", BLOCK_M) + kernel_options.setdefault("BLOCK_N", BLOCK_N) + # Blocksparse options + kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) + kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + + flex_attention_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + logsumexp, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ], + layout=layout, + subgraphs=[ + subgraph_buffer, + mask_graph_buffer, + ], + mutated_inputs=[ + logsumexp, + ], + num_stages=num_stages, + num_warps=num_warps, + call_sizes=query.get_size(), + **kernel_options, + ) + inputs_for_autotuning = ( + [ + query, + key, + value, + logsumexp, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ] + + list(score_mod_other_buffers) + + list(mask_mod_other_buffers) + ) + input_gen_fns = { + 4: create_num_blocks_fake_generator(kv_indices), + 5: create_indices_fake, + 6: create_num_blocks_fake_generator(full_kv_indices), + 7: create_indices_fake, + } + return ( + autotune_select_algorithm( + "flex_attention", + choices, + inputs_for_autotuning, + layout, + input_gen_fns=input_gen_fns, + ), + logsumexp, + ) + + +# ---------------------------- Backward HOP Implementation ---------------------------- + + +def flex_attention_backward_grid( + batch_size, q_heads, num_queries, d_model, kv_heads, num_key_value, meta +): + """How is this kernel parallelized? 
+ Currently this only parallelizes over batch * kv_heads, but we can, and want to, + parallelize over ceil_div(q_heads//kv_heads * num_key_value, key_value_block_size). + Doing so will require either atomic updates to some grad values or a two-pass kernel design. + """ + import triton + + return ( + triton.cdiv(num_queries, meta["BLOCK_M2"]) * (q_heads // kv_heads) + + triton.cdiv(num_key_value, meta["BLOCK_N1"]), + 1, + batch_size * kv_heads, + ) + + +flex_attention_backward_template = TritonTemplate( + name="flex_attention_backward", + grid=flex_attention_backward_grid, + source=r""" +{{def_kernel("Q", "K", "V", "LSE", "DELTA", "DO", "DQ", "DV", "KV_NUM_BLKS", "KV_IDX", "Q_NUM_BLKS", "Q_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX", "FULL_Q_NUM_BLKS", "FULL_Q_IDX")}} + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype) + # DELTA: Precomputed sum(OUT*DO, axis=-1) + # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value + # DK: Derivative of Key, which is written to via the store_output call due to some limitations with + # inductor codegen + # M: Number of queries, N: Number of keys/values + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # (Modifiable) Performance tuning options + # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block. + # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V. + # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q. + # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block. + # + # The following FULL_* and PARTIAL_* are defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query. + # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query. + # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query. + + # The kernel options below can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. This has + # about 20% more numerical error, but is slightly faster.
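+ # NB (illustration): program_id(0) is split into two ranges. The first
+ # NUM_KV_BLOCKS programs each own one BLOCK_N1 tile of K/V and compute
+ # dK/dV (looping over Q tiles); the remaining programs each own one
+ # BLOCK_M2 tile of Q per query head in the GQA group and compute dQ
+ # (looping over KV tiles). program_id(2) enumerates batch * HKV.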
+ + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qd = {{stride("Q")}} + stride_kz, stride_kh, stride_kn, stride_kd = {{stride("K")}} + stride_vz, stride_vh, stride_vn, stride_vd = {{stride("V")}} + stride_doz, stride_doh, stride_dom, stride_dod = {{stride("DO")}} + + stride_dqz, stride_dqh, stride_dqm, stride_dqd = {{stride("DQ")}} + stride_dvz, stride_dvh, stride_dvm, stride_dvd = {{stride("DV")}} + + Z = {{size("Q", 0)}} + HQ = {{size("Q", 1)}} + HKV = {{size("K", 1)}} + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + MATMUL_PRECISION = Q.dtype.element_ty + + pid = tl.program_id(0) + NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1) + NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2) + + off_hz = tl.program_id(2) + off_z = off_hz // HKV # batch idx + off_hkv = off_hz % HKV # kv head idx + + SPARSE_Z = {{size("KV_NUM_BLKS", 0)}} + SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}} + + sparse_idx_z = off_z % SPARSE_Z + + k_adj = (stride_kh * off_hkv + stride_kz * off_z).to(tl.int64) + v_adj = (stride_vh * off_hkv + stride_vz * off_z).to(tl.int64) + dv_adj = (stride_dvh * off_hkv + stride_dvz * off_z).to(tl.int64) + + # offset K, V, DV pointers for batch/kv-head + K += k_adj + V += v_adj + DV += dv_adj + + RCP_LN2 = 1.44269504 + offs_k = tl.arange(0, QK_HEAD_DIM) + offs_v = tl.arange(0, V_HEAD_DIM) + + if pid >= NUM_KV_BLOCKS: + off_pid = pid - NUM_KV_BLOCKS + # THIS BLOCK DOES DQ + SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2) + SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2) + off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS + start_m2_block = off_pid % NUM_Q_BLOCKS + off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE + stride_kv_num_blks_h = {{stride("KV_NUM_BLKS", 1)}} + stride_kv_idx_h = {{stride("KV_IDX", 1)}} + stride_kv_idx_m = {{stride("KV_IDX", 2)}} + + sparse_idx_hq2 = off_hq2 % SPARSE_HQ + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2 + + sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask + sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950 + + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads. + q_adj2 = (stride_qh * off_hq2 + stride_qz * off_z).to(tl.int64) + do_adj2 = (stride_doh * off_hq2 + stride_doz * off_z).to(tl.int64) + dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_z).to(tl.int64) + off_chz2 = ((off_z * HQ + off_hq2) * Q_LEN).to(tl.int64) + + Q2 = Q + q_adj2 + DO2 = DO + do_adj2 + # TODO: This does not work if DQ is not the same layout as Q (for example, + # if Q is broadcasted) + DQ2 = DQ + dq_adj2 + LSE2 = LSE + off_chz2 + DELTA2 = DELTA + off_chz2 + + dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32) + + start_m2 = start_m2_block * BLOCK_M2 + offs_m2 = start_m2 + tl.arange(0, BLOCK_M2) + + # load Q and do: they stay in SRAM throughout the inner loop. 
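+ # NB (illustration): DELTA holds Di ~= rowsum(OUT * DO) (adjusted for any
+ # incoming grad of the logsumexp), precomputed by the lowering. It enters
+ # the softmax backward identity
+ #   dS = P * (dP - Di), where dP = dO @ V^T,
+ # which is what bwd_dq_block_mn and bwd_dkdv_block_mn compute below.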
+ if IS_DIVISIBLE: + q = tl.load(Q2 + offs_m2[:, None] * stride_qm + offs_k[None, :] * stride_qd) + do = tl.load(DO2 + offs_m2[:, None] * stride_dom + offs_v[None, :] * stride_dod) + else: + q = tl.load(Q2 + offs_m2[:, None] * stride_qm + offs_k[None, :] * stride_qd, mask=offs_m2[:, None] < Q_LEN) + do = tl.load(DO2 + offs_m2[:, None] * stride_dom + offs_v[None, :] * stride_dod, mask=offs_m2[:, None] < Q_LEN) + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + if IS_DIVISIBLE: + Di = tl.load(DELTA2 + offs_m2) + lse = tl.load(LSE2 + offs_m2) + else: + Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN) + lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN) + lse = tl.where(lse == -float("inf"), 0.0, lse) + lse = lse[:, None] + + # ~~~~~~~~~~~ partially unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~ + # KV_IDX and KV_NUM_BLKS are always contiguous. + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + + offs_n2 = kv_start + tl.arange(0, BLOCK_N2) + dq = bwd_dq_inner( + {{gen_argdefs()}}, + K, V, + dq, q, do, Di, lse, + off_z, off_hq2, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + if HAS_FULL_BLOCKS: + # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + + offs_n2 = kv_start + tl.arange(0, BLOCK_N2) + dq = bwd_dq_inner( + {{gen_argdefs()}}, + K, V, + dq, q, do, Di, lse, + off_z, off_hq2, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + # Write back dQ. + dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd + dq *= SM_SCALE + if IS_DIVISIBLE: + tl.store(dq_ptrs, dq) + else: + tl.store(dq_ptrs, dq, mask=offs_m2[:, None] < Q_LEN) + else: + # THIS BLOCK DOES DK & DV + SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1) + SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1) + + pid_mask = pid // SPARSE_KV_MULTIPLE + + stride_q_num_blks_h = {{stride("Q_NUM_BLKS", 1)}} + stride_q_idx_h = {{stride("Q_IDX", 1)}} + stride_q_idx_n = {{stride("Q_IDX", 2)}} + + dv = tl.zeros([BLOCK_N1, V_HEAD_DIM], dtype=tl.float32) + dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM], dtype=tl.float32) + + start_n1 = pid * BLOCK_N1 + offs_n1 = start_n1 + tl.arange(0, BLOCK_N1) + + # load K and V: they stay in SRAM throughout the inner loop. + if IS_DIVISIBLE: + k = tl.load(K + offs_n1[:, None] * stride_kn + offs_k[None, :] * stride_kd) + v = tl.load(V + offs_n1[:, None] * stride_vn + offs_v[None, :] * stride_vd) + else: + k = tl.load(K + offs_n1[:, None] * stride_kn + offs_k[None, :] * stride_kd, mask=offs_n1[:, None] < KV_LEN) + v = tl.load(V + offs_n1[:, None] * stride_vn + offs_v[None, :] * stride_vd, mask=offs_n1[:, None] < KV_LEN) + if PRESCALE_QK: + k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + for off_g in range(0, GQA_SHARED_HEADS): + off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g + + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offset by query head.
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_z).to(tl.int64) + do_adj1 = (stride_doh * off_hq1 + stride_doz * off_z).to(tl.int64) + dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_z).to(tl.int64) + off_chz1 = ((off_z * HQ + off_hq1) * Q_LEN).to(tl.int64) + + Q1 = Q + q_adj1 + DO1 = DO + do_adj1 + # TODO: This does not work if DQ is not the same layout as Q (for example, + # if Q is broadcasted) + LSE1 = LSE + off_chz1 + DELTA1 = DELTA + off_chz1 + + sparse_idx_hq1 = off_hq1 % SPARSE_HQ + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1 + + sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask + sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950 + + # ~~~~~~~~~~~~~~~ partially unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Q_IDX and Q_NUM_BLKS are always contiguous. + q_indices = Q_IDX + sparse_q_idx_offset + q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading + sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset) + + offs_m1 = q_start + tl.arange(0, BLOCK_M1) + dk, dv = bwd_dkdv_inner( + {{gen_argdefs()}}, + Q1, DO1, DELTA1, LSE1, + dk, dv, k, v, + off_z, off_hq1, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + + if HAS_FULL_BLOCKS: + # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous. + q_indices = FULL_Q_IDX + sparse_q_idx_offset + q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading + sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset) + + offs_m1 = q_start + tl.arange(0, BLOCK_M1) + dk, dv = bwd_dkdv_inner( + {{gen_argdefs()}}, + Q1, DO1, DELTA1, LSE1, + dk, dv, k, v, + off_z, off_hq1, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + # Write back dV and dK. + dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd + + index_n = offs_n1[:, None] + index_k = offs_k[None, :] + + if IS_DIVISIBLE: + tl.store(dv_ptrs, dv) + else: + tl.store(dv_ptrs, dv, mask=index_n < KV_LEN) + + dk *= SM_SCALE + mask = index_n < KV_LEN + {{store_output(("off_z", "off_hkv", "index_n", "index_k"), "dk", "mask", indent_width=8)}} + +@triton.jit +def bwd_dq_inner( + {{gen_argdefs()}}, + K, V, # pointers + dq, q, do, Di, lse, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS, +): + {{gen_defines() | indent_except_first(1) }} + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2) + RCP_LN2: tl.constexpr = 1.44269504 + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + offs_k = tl.arange(0, QK_HEAD_DIM) + offs_v = tl.arange(0, V_HEAD_DIM) + + kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd + vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd + # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0) + + hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1)) + if not IS_DIVISIBLE: + if hi >= 1: + for start_n in range(0, hi - 1): + dq = bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + + # Increment pointers. + offset = get_offset_for_next_block( + start_n, kv_indices, sparse_kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2 + ) + + kT_ptrs += offset * stride_kn + vT_ptrs += offset * stride_vn + + offs_n2 += offset + + dq = bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + else: + for start_n in range(0, hi): + dq = bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + + # Increment pointers. + offset = get_offset_for_next_block( + start_n, kv_indices, sparse_kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2 + ) + + kT_ptrs += offset * stride_kn + vT_ptrs += offset * stride_vn + + offs_n2 += offset + + return dq + + +@triton.jit +def bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, +): + {{gen_defines() | indent_except_first(1)}} + + if IS_DIVISIBLE: + kT = tl.load(kT_ptrs) + else: + kT = tl.load(kT_ptrs, mask=offs_n2[None, :] < KV_LEN) + qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION) + if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + pre_mod_scores = qk + if CHECK_BLOCK_BOUNDARY: + m = offs_m2[:, None] % Q_LEN + n = offs_n2[None, :] % KV_LEN + else: + m = offs_m2[:, None] + n = offs_n2[None, :] + {{ modification( + subgraph_number=0, + output_name="post_mod_scores", + score="qk", + b="off_z", + h="off_hq", + m="m", + n="n", + out="qk" + ) | indent_except_first(1) }} + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + {{ modification( + subgraph_number=2, + output_name="mask_mod_output", + score="qk", + b="off_z", + h="off_hq", + m="m", + n="n", + ) | indent_except_first(2) }} + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n2[None, :] < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partial masked block + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + p = tl.math.exp2(post_mod_scores - lse) + # Compute dP and dS. 
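+ # NB (illustration): p recomputes the softmax probabilities from the saved
+ # logsumexp instead of storing them in the forward pass -- the usual
+ # FlashAttention memory trade. Since lse = m_i + log2(l_i) is in base-2
+ # units and the scores were scaled by RCP_LN2, exp2(score - lse) equals the
+ # forward softmax row-wise.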
+ if IS_DIVISIBLE: + vT = tl.load(vT_ptrs) + else: + vT = tl.load(vT_ptrs, mask=offs_n2[None, :] < KV_LEN) + dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION) + ds = p * (dp - Di[:, None]) + # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~ + {{ modification( + subgraph_number=1, + output_name = "grad_scores", + score="pre_mod_scores", + b="off_z", + h="off_hq", + m="m", + n="n", + grad_score_mod="ds" + ) | indent_except_first(1) }} + if CHECK_BLOCK_BOUNDARY: + grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0) + + ds = grad_scores + + if not IS_FULL_BLOCKS: + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n2[None, :] < KV_LEN, mask_mod_output, float("-inf")) + # (grads) apply mask for partially unmasked block + ds = tl.where(mask_mod_output, ds, 0.0) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ds = ds.to(MATMUL_PRECISION) + # Compute dQ. + dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION) + + return dq + + +@triton.jit +def bwd_dkdv_inner( + {{gen_argdefs()}}, + Q, DO, DELTA, LSE, # pointers + dk, dv, k, v, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS, +): + {{gen_defines() | indent_except_first(1) }} + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1) + RCP_LN2: tl.constexpr = 1.44269504 + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + offs_k = tl.arange(0, QK_HEAD_DIM) + offs_v = tl.arange(0, V_HEAD_DIM) + + qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd + do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod + # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work. + tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0) + hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1)) + + if not IS_DIVISIBLE: + if hi >= 1: + for start_m in range(0, hi - 1): + dk, dv = bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + # Increment pointers. + offset = get_offset_for_next_block( + start_m, q_indices, sparse_q_num_blocks, + SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1 + ) + + qT_ptrs += offset * stride_qm + do_ptrs += offset * stride_dom + + offs_m1 += offset + + dk, dv = bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + else: + for start_m in range(0, hi): + dk, dv = bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + # Increment pointers. 
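+ # NB (assumption, inferred from how the result is used): get_offset_for_next_block
+ # returns the element distance along the seqlen dim from the current block to the
+ # next block named by the block-sparse indices, so consecutive tiles inside one
+ # sparse block advance by BLOCK size while crossing sparse blocks can skip over
+ # masked-out spans entirely.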
+ offset = get_offset_for_next_block( + start_m, q_indices, sparse_q_num_blocks, + SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1 + ) + + qT_ptrs += offset * stride_qm + do_ptrs += offset * stride_dom + + offs_m1 += offset + + return dk, dv + + +@triton.jit +def bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, +): + {{gen_defines() | indent_except_first(1) }} + + # Load LSE before computing qk to reduce pipeline stall. + if IS_DIVISIBLE: + qT = tl.load(qT_ptrs) + lse = tl.load(LSE + offs_m1) + else: + qT = tl.load(qT_ptrs, mask=offs_m1[None, :] < Q_LEN) + lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN) + lse = tl.where(lse == -float("inf"), 0.0, lse) + qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION) + if not PRESCALE_QK: + qkT *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + m = offs_m1[None, :] % Q_LEN + n = offs_n1[:, None] % KV_LEN + else: + m = offs_m1[None, :] + n = offs_n1[:, None] + pre_mod_scores = qkT + {{ modification( + subgraph_number=0, + output_name="post_mod_scores", + score="qkT", + b="off_z", + h="off_hq", + m="m", + n="n", + out="qkT" + ) | indent_except_first(1) }} + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n1[:, None] < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + {{ modification( + subgraph_number=2, + output_name="mask_mod_output", + score="qkT", + b="off_z", + h="off_hq", + m="m", + n="n", + ) | indent_except_first(2) }} + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n1[:, None] < KV_LEN, mask_mod_output, float("-inf")) + # (grads) apply mask for fully masked block + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + pT = tl.math.exp2(post_mod_scores - lse[None, :]) + if IS_DIVISIBLE: + do = tl.load(do_ptrs) + else: + do = tl.load(do_ptrs, mask=offs_m1[:, None] < Q_LEN) + # Compute dV. + ppT = pT + dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION) + if IS_DIVISIBLE: + Di = tl.load(DELTA + offs_m1) + else: + Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN) + # Compute dP and dS. 
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION) + dsT = pT * (dpT - Di[None, :]) + # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~ + {{ modification( + subgraph_number=1, + output_name = "grad_scores", + score="pre_mod_scores", + b="off_z", + h="off_hq", + m="m", + n="n", + grad_score_mod="dsT" + ) | indent_except_first(1) }} + if CHECK_BLOCK_BOUNDARY: + grad_scores = tl.where(offs_n1[:, None] < KV_LEN, grad_scores, 0.0) + + dsT = grad_scores + if not IS_FULL_BLOCKS: + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n1[:, None] < KV_LEN, mask_mod_output, float("-inf")) + # (grads) apply mask for partially unmasked block + dsT = tl.where(mask_mod_output, dsT, 0.0) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION) + + return dk, dv + """ + + compute_next_offset_func, +) + + +# TODO: We probably also need a layout constraint? +@register_lowering( + torch.ops.higher_order.flex_attention_backward, type_promotion_kind=None +) +def flex_attention_backward(*args, **kwargs): + ( + query, + key, + value, + out, + logsumexp, + grad_out, + grad_logsumexp, + fw_graph, + joint_graph, + block_mask, + scale, + kernel_options, + score_mod_other_buffers, + mask_mod_other_buffers, + ) = args + ( + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + SPARSE_KV_BLOCK_SIZE, + SPARSE_Q_BLOCK_SIZE, + mask_graph, + ) = block_mask + + ( + query, + key, + value, + grad_out, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ) = maybe_realize( + [ + query, + key, + value, + grad_out, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ] + ) + + device = query.get_device() + dtype = query.get_dtype() + Bq, Hq, seq_len_q, qk_head_dim = query.get_size() + Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size() + assert Bq == Bkv, "Batch dimension must match" + B = Bq + + kernel_options = dict(kernel_options) + kernel_options.setdefault("FLOAT32_PRECISION", get_float32_precision()) + if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0: + kernel_options.setdefault("IS_DIVISIBLE", False) + else: + kernel_options.setdefault("IS_DIVISIBLE", True) + + fwd_placeholder_inps = [ + create_placeholder(name, dtype, device) + for name, dtype in [ + ("score", dtype), + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + fw_subgraph_buffer = build_subgraph_buffer( + fwd_placeholder_inps + list(score_mod_other_buffers), fw_graph + ) + + joint_placeholder_inps = fwd_placeholder_inps + [ + create_placeholder("grad_score_mod", dtype, device) + ] + joint_subgraph_buffer, *_ = build_subgraph_buffer( + joint_placeholder_inps + list(score_mod_other_buffers), joint_graph + ) + + mask_graph_placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + mask_graph_buffer = build_subgraph_buffer( + mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph + ) + + layout_k = FixedLayout( + key.get_device(), + key.get_dtype(), + key.get_size(), + key.get_stride(), + ) + + # Create delta which will is needed for the bwd's kernel + grad_lse_exp2 = 
lowerings[aten.mul](grad_logsumexp, 1 / math.log(2)) + mul_delta = lowerings[aten.mul](out, grad_out) + delta = lowerings[aten.sum](mul_delta, axis=-1) + delta = lowerings[aten.sub](delta, grad_lse_exp2) + delta = ExternKernel.require_contiguous(delta) + + grad_lse_exp2, delta = maybe_realize([grad_lse_exp2, delta]) + + # see NOTE:[TritonTemplates with multiple outputs] + grad_query = empty_strided( + query.get_size(), query.get_stride(), dtype=dtype, device=device + ) + grad_value = empty_strided( + value.get_size(), value.get_stride(), dtype=dtype, device=device + ) + + kernel_options.setdefault("SM_SCALE", scale) + + # Determine GQA factor + gqa_shared_heads = Hq // Hkv + kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads) + + # Inside of Triton kernel, only apply partial masking if partial blocks are computed. + # full_kv_num_blocks is torch.zeros([1, 1, 1]) if partial blocks are not computed. + has_full_blocks = full_kv_num_blocks is not None + kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks) + if not has_full_blocks: + full_kv_num_blocks, full_kv_indices, full_q_num_blocks, full_q_indices = ( + empty(0, device=query.get_device()) for _ in range(4) + ) + kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim) + kernel_options.setdefault("V_HEAD_DIM", v_head_dim) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int, int]] = [] + configs.append(_get_default_config_bwd(query)) + if config.max_autotune: + configs.extend( + [ + (BLOCK1, BLOCK2, w, s) + for BLOCK1 in [32, 64] + for BLOCK2 in [32, 64, 128] + for w in [4, 8] + for s in [1, 3, 4, 5] + if BLOCK2 % BLOCK1 == 0 + ] + ) + + for BLOCK1, BLOCK2, num_warps, num_stages in configs: + if ( + SPARSE_KV_BLOCK_SIZE % BLOCK1 != 0 + or SPARSE_Q_BLOCK_SIZE % BLOCK1 != 0 + or SPARSE_KV_BLOCK_SIZE % BLOCK2 != 0 + or SPARSE_Q_BLOCK_SIZE % BLOCK2 != 0 + ): + continue + + # Performance tuning + kernel_options.setdefault("BLOCK_M1", BLOCK1) + kernel_options.setdefault("BLOCK_N1", BLOCK2) + kernel_options.setdefault("BLOCK_M2", BLOCK2) + kernel_options.setdefault("BLOCK_N2", BLOCK1) + # Blocksparse options + kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) + kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + + flex_attention_backward_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + logsumexp, + delta, + grad_out, + grad_query, + grad_value, + kv_num_blocks, + kv_indices, + q_num_blocks, + q_indices, + full_kv_num_blocks, + full_kv_indices, + full_q_num_blocks, + full_q_indices, + ], + layout=layout_k, # We use store_output only for grad_key + subgraphs=[fw_subgraph_buffer, joint_subgraph_buffer, mask_graph_buffer], + mutated_inputs=[grad_query, grad_value], + call_sizes=query.get_size() + key.get_size()[1:3], + num_stages=num_stages, + num_warps=num_warps, + **kernel_options, + ) + inputs_for_autotuning = ( + [ + query, + key, + value, + logsumexp, + delta, + grad_out, + grad_query, + grad_value, + kv_num_blocks, + kv_indices, + q_num_blocks, + q_indices, + full_kv_num_blocks, + full_kv_indices, + full_q_num_blocks, + full_q_indices, + ] + + list(score_mod_other_buffers) + + list(mask_mod_other_buffers) + ) + input_gen_fns = { + 8: create_num_blocks_fake_generator(kv_indices), # kv_num_blocks + 9: create_indices_fake, + 10: create_num_blocks_fake_generator(q_indices), # q_num_blocks + 11: create_indices_fake, + 12: create_num_blocks_fake_generator(full_kv_indices), # full_kv_num_blocks + 13: create_indices_fake, + 14: 
create_num_blocks_fake_generator(full_q_indices), # full_q_num_blocks + 15: create_indices_fake, + } + + grad_key = autotune_select_algorithm( + "flex_attention_backward", + choices, + inputs_for_autotuning, + layout_k, + input_gen_fns=input_gen_fns, + ) + return ( + grad_query, + grad_key, + grad_value, + ) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py new file mode 100644 index 0000000000000000000000000000000000000000..8a5b7c60eafddacfadf4ee25e7e3c943fb48697f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py @@ -0,0 +1,570 @@ +# mypy: allow-untyped-defs +"""Triton implementation of the flex_attention kernel for short query length (FlexDecoding).""" +from typing import Any, List, Tuple + +import sympy + +import torch +from torch._inductor.virtualized import V + +from .. import config, ir +from ..ir import FixedLayout, FlexibleLayout +from ..lowering import empty, empty_strided, lowerings +from ..runtime.runtime_utils import is_power_of_2, next_power_of_2 +from ..select_algorithm import autotune_select_algorithm, TritonTemplate +from .flex_attention import ( + compute_forward_block_mn, + compute_forward_inner, + compute_next_offset_func, + create_indices_fake, + create_num_blocks_fake_generator, + maybe_realize, +) + + +aten = torch.ops.aten +prims = torch.ops.prims + + +def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, meta): + """How is this kernel parallelized? + We create a grid of (batch_size * kv_heads, SPLIT_KV, 1) + Each block is responsible for iterating over blocks of keys and values, calculating + the local output for its tile of keys and values over the full length of the query. + Groups of SPLIT_KV blocks then combine their outputs to produce the final result. + """ + + return (batch_size * kv_heads, meta["SPLIT_KV"], 1) + + +flex_decoding_template = TritonTemplate( + name="flex_decoding", + grid=flex_decoding_grid, + source=r""" + {{def_kernel("Q", "K", "V", "M", "L", "KV_NUM_BLKS", "KV_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX")}} + # Sub notation for this kernel: + # Q: Query, K: Key, V: Value + # reduction buffers: M rowmax across local KV split, L local sumexp across local KV split + # M: Number of queries, N: Number of keys/values + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # BLOCK_M, QK_HEAD_DIM: the M and D dimensions are always assigned to the same block + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head, t: Number of kv splits + # (Modifiable) Config options: + # SPLIT_KV: number of blocks K & V are split into + # TILE_KV: length of each local KV split + # BLOCK_M: block size that Q is padded along seqlen dim. + # BLOCK_N: block size of K & V along N dimension. + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + # SAFE_M_BOUNDARY: Is Q seqlen a multiple of BLOCK_M? If so, we can skip an extra boundary check for loading query. + # SAFE_N_BOUNDARY: Is KV seqlen a multiple of BLOCK_N? If so, we can skip an extra boundary check for loading key/value. + + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and do the change of base out of the loop.
+ # + # SPARSE_KV_BLOCK_SIZE: sparse mask block size along KV seqlen dim. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # + # + # Output: ACC output accumulated across local KV split. + + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define Q Strides + stride_qz, stride_qh, stride_qg, stride_qm, stride_qk = {{stride("Q")}} + stride_kz, stride_kh, stride_kn, stride_kk = {{stride("K")}} + stride_vz, stride_vh, stride_vn, stride_vk = {{stride("V")}} + stride_mz, stride_mt, stride_mh, stride_mm = {{stride("M")}} + stride_lz, stride_lt, stride_lh, stride_lm = {{stride("L")}} + + + Z = {{size("Q", 0)}} + HKV = {{size("Q", 1)}} + G: tl.constexpr = GQA_SHARED_HEADS + HQ = HKV * G + Q_LEN = {{size("Q", 3)}} + KV_LEN = {{size("K", 2)}} + + MATMUL_PRECISION = Q.dtype.element_ty + + # Make sure each split is a multiple of BLOCK_N + TILE_KV_OG = tl.cdiv(KV_LEN, SPLIT_KV) + TILE_KV = tl.cdiv(TILE_KV_OG, BLOCK_N) * BLOCK_N + TILE_KV_MULTIPLE: tl.constexpr = (TILE_KV // BLOCK_N) + + off_z = tl.program_id(0) // HKV + off_hkv = tl.program_id(0) % HKV + off_t = tl.program_id(1) + + q_offset = off_z * stride_qz + off_hkv * stride_qh + k_offset = off_z * stride_kz + off_hkv * stride_kh + v_offset = off_z * stride_vz + off_hkv * stride_vh + + SPARSE_Z = {{size("KV_NUM_BLKS", 0)}} + SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}} + + sparse_idx_z = off_z % SPARSE_Z + # TODO: support masks not broadcasted along the head dimension. + tl.device_assert(SPARSE_HQ == 1) + sparse_idx_h = 0 + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + SPARSE_KV_BLOCK_CNT = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE) + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + # initialize offsets + tl.device_assert(BLOCK_M % G == 0) + BLOCK_M_PER_HQ: tl.constexpr = BLOCK_M // G + off_g = tl.arange(0, G) # [G] + offs_g = tl.ravel(tl.broadcast_to(off_g[:, None], [G, BLOCK_M_PER_HQ])) # [BLOCK_M] + offs_hq = offs_g + off_hkv * G + off_m = tl.arange(0, BLOCK_M_PER_HQ) # [BLOCK_M_PER_HQ] + offs_m = tl.ravel(tl.broadcast_to(off_m[None, :], [G, BLOCK_M_PER_HQ])) # [BLOCK_M] + offs_d = tl.arange(0, QK_HEAD_DIM) + offs_vd = tl.arange(0, V_HEAD_DIM) + + # KV_IDX / FULL_KV_IDX and KV_NUM_BLKS / FULL_KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_h + + # Calculate the KV blocks that belong to this CTA.
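+ # For example (hypothetical numbers): with KV_LEN = 4096, SPLIT_KV = 8 and
+ # BLOCK_N = 64, TILE_KV = 512 and TILE_KV_MULTIPLE = 8, so the CTA with
+ # off_t = t owns BLOCK_N-sized blocks [8*t, 8*(t+1)) of its KV head.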
+ block_n_start = off_t * TILE_KV_MULTIPLE # n_offset inside sparse block + block_n_end = block_n_start + TILE_KV_MULTIPLE # end BLOCK_N + + q_range = stride_qg * off_g[:, None, None] + stride_qm * off_m[None, :, None] + stride_qk * offs_d[None, None, :] + + if SAFE_M_BOUNDARY: + q = tl.load(Q + q_offset + q_range) + else: + mask = off_m[None, :, None] < Q_LEN + q = tl.load(Q + q_offset + q_range, mask) + + q = tl.reshape(q, [BLOCK_M, QK_HEAD_DIM]) + + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Apply both score_mod and mask_mod + + # find first kv block we are loading and the number of blocks we are loading + kv_indices = KV_IDX + sparse_hz_offset * SPARSE_KV_BLOCK_CNT + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_hz_offset) + indices_idx = block_n_start // SPARSE_KV_MULTIPLE + off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE + off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N + # first kv block we're loading + + # last valid block according to sparse mask + block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(QK_HEAD_DIM, KV_LEN), # (d, N) + strides=(stride_kk, stride_kn), + offsets=(0, off_n), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(off_n, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = tl.arange(0, BLOCK_N) + off_n + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # offsets + off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :], + # block sparse data + kv_indices, kv_num_blocks, + block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + kv_indices = FULL_KV_IDX + sparse_hz_offset * SPARSE_KV_BLOCK_CNT + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_hz_offset) + indices_idx = block_n_start // SPARSE_KV_MULTIPLE + off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE + off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N + + # last valid block according to sparse mask + block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(QK_HEAD_DIM, KV_LEN), # (d, N) + strides=(stride_kk, stride_kn), + offsets=(0, off_n), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(off_n, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = tl.arange(0, BLOCK_N) + off_n + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # offsets + off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :], + # block sparse data + kv_indices, kv_num_blocks, + block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid, +
MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + m_offset = off_t * stride_mt + off_z * stride_mz + l_offset = off_t * stride_lt + off_z * stride_lz + + M_block_ptr = tl.make_block_ptr( + base=M + m_offset, + shape=(G, Q_LEN), # (G, M) + strides=(stride_mh, stride_mm), + offsets=(off_hkv*G, 0), + block_shape=(G, BLOCK_M_PER_HQ), + order=(1, 0) + ) + L_block_ptr = tl.make_block_ptr( + base=L + l_offset, + shape=(G, Q_LEN), # (G, M) + strides=(stride_lh, stride_lm), + offsets=(off_hkv*G, 0), + block_shape=(G, BLOCK_M_PER_HQ), + order=(1, 0) + ) + + # Store output, logsumexp and rowmax for cross CTA reduction. (all in float32, even when input data are in fp16) + m_i = m_i.reshape(G, BLOCK_M_PER_HQ) + l_i = l_i.reshape(G, BLOCK_M_PER_HQ) + if SAFE_M_BOUNDARY: + tl.store(M_block_ptr, m_i) + tl.store(L_block_ptr, l_i) + else: + tl.store(M_block_ptr, m_i, boundary_check=(1,)) + tl.store(L_block_ptr, l_i, boundary_check=(1,)) + + # -- store output + idx_z = off_z + idx_t = off_t + idx_hq = off_hkv*G + off_g[:, None, None] + idx_m = off_m[None, :, None] + idx_d = offs_vd[None, None, :] + + mask = (idx_m < Q_LEN) + acc = acc.reshape(G, BLOCK_M_PER_HQ, V_HEAD_DIM) + {{store_output(("idx_z", "idx_t", "idx_hq", "idx_m", "idx_d"), "acc", "mask")}} + """ + + compute_forward_inner + + compute_next_offset_func + + compute_forward_block_mn, +) + + +def get_split_k(B: int, H: int, Mk: int, SM: int = 128) -> int: + """Heuristic for the number of splits from xformer""" + bh = max(B * H, 1) # NOTE: Handle B*h=0 case + split_k = SM // bh # Each SM should at least get one block. + split_k = max(split_k, 1) + + return split_k + + +def _get_decoding_default_config(key) -> Tuple[int, int, int]: + dtype = key.get_dtype() + head_dim = key.get_size()[-1] + sm_version = torch.cuda.get_device_capability() + default_config = (64, 2, 1) + if sm_version >= (9, 0): + if head_dim > 128 and dtype == torch.float32: + return default_config + return (64, 2, 3) + return default_config + + +def create_flex_decoding_kernel(*args, **kwargs): + ( + query, + key, + value, + block_mask, + scale, + kernel_options, + score_mod_subgraph, + mask_mod_subgraph, + score_mod_other_buffers, + mask_mod_other_buffers, + ) = args + ( + kv_num_blocks, + kv_indices, + full_kv_num_blocks, # full_kv_num_blocks, + full_kv_indices, # full_kv_indices, + _, # q_num_blocks + _, # q_indices + _, # full_q_num_blocks, + _, # full_q_indices, + SPARSE_KV_BLOCK_SIZE, + _, # SPARSE_Q_BLOCK_SIZE, + _, + ) = block_mask + + Bq, Hq, seq_len_q, qk_head_dim = query.get_size() + Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size() + assert Bq == Bkv, "Batch dimension must match" + B = Bq + kernel_options = dict(kernel_options) + + # TODO: Fix flex decoding non-divisible case! + if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0: + kernel_options.setdefault("IS_DIVISIBLE", False) + else: + kernel_options.setdefault("IS_DIVISIBLE", True) + + # Calculate GQA head sharing + gqa_shared_heads = Hq // Hkv + if not is_power_of_2(gqa_shared_heads): + raise ValueError( + "Number of shared query heads sharing the same KV head must be power of 2. 
" + ) + kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads) + + # Determine if there are "full" blocks where we only need to apply score_mod, and can skip mask_mod + has_full_blocks = full_kv_num_blocks is not None + kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks) + if not has_full_blocks: + # Create a plackeholder full block list in case it is empty + full_kv_num_blocks, full_kv_indices = ( + empty(0, device=query.get_device()) for _ in range(2) + ) + + ( + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ) = maybe_realize( + [ + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ] + ) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int]] = [] + configs.append(_get_decoding_default_config(key)) + # Note: max_autotune is not supported yet. Causes error in lowering the dynamic shape in reduction ops. + if config.max_autotune: + configs += [ + (64, 2, 2), + (32, 2, 3), + (128, 2, 3), + ] + # TODO: fix autotuning. + + kernel_options.setdefault("SM_SCALE", scale) + kernel_options.setdefault("SPLIT_KV", get_split_k(B, Hkv, seq_len_kv)) + MAX_SPLIT_KV = kernel_options["SPLIT_KV"] + + # create config dependent intermediate buffers + buf_ACC_shape = [B, MAX_SPLIT_KV, Hq, seq_len_q, v_head_dim] + buf_ML_shape = buf_ACC_shape[:-1] + buf_M = empty_strided( + buf_ML_shape, + None, + dtype=torch.float32, # The rowmax is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + buf_L = empty_strided( + buf_ML_shape, + None, + dtype=torch.float32, # The intermediate sumexp is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + + layout_acc = FixedLayout( + query.get_device(), + torch.float32, + buf_ACC_shape, + FlexibleLayout.contiguous_strides(buf_ACC_shape), + ) + + kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim) + kernel_options.setdefault("V_HEAD_DIM", v_head_dim) + + kernel_options.setdefault( + "BLOCK_M", + ( + # m + # if V.graph.sizevars.evaluate_expr(sympy.Lt(query.get_size()[-2], 0)) + # else # Always use a BLOCK_M > 16 before Triton fix https://github.com/triton-lang/triton/pull/4061 is in pin + max( + next_power_of_2( + V.graph.sizevars.size_hint( + seq_len_q, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + * gqa_shared_heads + ), + 16, + ) + ), + ) + + query = ir.ExternKernel.realize_input(query) + stride_b, stride_hq, stride_seq_len_q, stride_qk_head_dim = query.get_stride() + + # Reshape query for GQA: [B, Hq, Mq, D] -> [B, Hkv, G, Mq, D] + gqa_query_shape = (B, Hkv, gqa_shared_heads, seq_len_q, qk_head_dim) + gqa_query_stride = ( + stride_b, + stride_hq * gqa_shared_heads, + stride_hq, + stride_seq_len_q, + stride_qk_head_dim, + ) + query = lowerings[aten.as_strided](query, gqa_query_shape, gqa_query_stride) + + V.graph.sizevars.guard_leq( + seq_len_q * gqa_shared_heads, sympy.Integer(kernel_options["BLOCK_M"]) + ) + + kernel_options.setdefault( + "SAFE_M_BOUNDARY", + ((seq_len_q * gqa_shared_heads) % kernel_options["BLOCK_M"]) == 0, + ) + # TODO: This feels sketchy + kernel_options.setdefault("SAFE_N_BOUNDARY", True) + + # Note, we don't need to pass in the captured buffers explicitly + # because they're implicitly added by the score_mod function + # We do need to explicitly pass it in for autotuning though. 
+ for BLOCK_N, num_warps, num_stages in configs: + if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0: + continue + + # Performance tuning + kernel_options.setdefault("BLOCK_N", BLOCK_N) + kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + + # Work around https://github.com/pytorch/pytorch/issues/129625 + if num_stages == 2: + continue + flex_decoding_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + buf_M, + buf_L, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ], + layout=layout_acc, + subgraphs=[ + score_mod_subgraph, + mask_mod_subgraph, + ], + mutated_inputs=[buf_M, buf_L], + num_stages=num_stages, + num_warps=num_warps, + call_sizes=query.get_size(), + **kernel_options, + ) + + inputs_for_flex_decoding = ( + [ + query, + key, + value, + buf_M, + buf_L, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ] + + list(score_mod_other_buffers) + + list(mask_mod_other_buffers) + ) + + input_gen_fns = { + 5: create_num_blocks_fake_generator(kv_indices), + 6: create_indices_fake, + 7: create_num_blocks_fake_generator(full_kv_indices), + 8: create_indices_fake, + } + + buf_ACC = autotune_select_algorithm( + "flex_decoding", + choices, + inputs_for_flex_decoding, + layout_acc, + input_gen_fns=input_gen_fns, + ) + + # Reduction + + g_M = lowerings[aten.max](buf_M, dim=1, keepdim=True)[0] + # See [Note] Handle fully masked out rows: + # g_M Is the global max among split kv blocks. + masked_rows = lowerings[aten.eq](g_M, -float("inf")) + adj_M = lowerings[aten.sub](buf_M, g_M) + adj_M = lowerings[aten.where](masked_rows, 0, adj_M) + alpha = lowerings[aten.exp2](adj_M) + + buf_L = lowerings[aten.mul](buf_L, alpha) + g_L = lowerings[aten.sum](buf_L, axis=1) + masked_rows_squeezed = lowerings[aten.squeeze](masked_rows, dim=1) + g_L = lowerings[aten.where](masked_rows_squeezed, 1.0, g_L) + logsumexp = lowerings[aten.log2](g_L) + logsumexp = lowerings[aten.add](logsumexp, lowerings[aten.squeeze](g_M, dim=1)) + + alpha_unseq = lowerings[aten.unsqueeze](alpha, 4) + buf_ACC = lowerings[aten.mul](buf_ACC, alpha_unseq) + output = lowerings[aten.sum](buf_ACC, axis=1) + L_unseq = lowerings[aten.unsqueeze](g_L, 3) + output = lowerings[aten.div](output, L_unseq) + output = lowerings[prims.convert_element_type](output, query.get_dtype()) + + return ( + output, + logsumexp, + ) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py new file mode 100644 index 0000000000000000000000000000000000000000..c2e09300f9b8a7eb6615618c8f34aebebe6e80a2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py @@ -0,0 +1,776 @@ +# mypy: allow-untyped-defs +import functools +import logging +from typing import Any, Dict, List, Optional + +import torch +from torch._inductor.autoheuristic.autoheuristic import AutoHeuristicSelectAlgorithm +from torch._inductor.autoheuristic.autoheuristic_utils import ( + AHContext, + context_add_strides, + context_add_using_tf32, + get_mixedmm_precondition, + mixed_mm_operations, + mm_operations, +) +from torch._inductor.codegen.cpp_gemm_template import CppPackedGemmTemplate +from torch._inductor.virtualized import V + +from .. 
import config as inductor_config +from ..codegen.common import BackendFeature +from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate +from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate +from ..codegen.wrapper import WrapperCodeGen +from ..ir import FlexibleLayout, is_triton +from ..lowering import register_lowering +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + NoValidChoicesError, + TritonTemplate, +) +from ..utils import ( + get_gpu_shared_memory, + use_aten_gemm_kernels, + use_ck_template, + use_cpp_packed_gemm_template, + use_cutlass_template, + use_max_autotune, + use_triton_template, +) +from .mm_common import ( + addmm_epilogue, + extra_mm_configs, + int8_mm_configs, + mixed_mm_configs, + mm_args, + mm_configs, + mm_grid, + mm_options, + triton_config, +) + + +log = logging.getLogger(__name__) +aten = torch.ops.aten + +mm_template = TritonTemplate( + name="mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
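+        # When K is not a multiple of BLOCK_K (EVEN_K is False), the final loop
+        # iteration has k < BLOCK_K; lanes with rk >= k load 0.0 and therefore
+        # contribute nothing to the dot product below.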
+ if B_PROLOGUE_CAST_TYPE is not None: + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + +aten_mm = ExternKernelChoice(torch.mm, "at::mm_out") + +aten_addmm = ExternKernelChoice( + torch.addmm, "at::addmm_out", op_overload=aten.addmm.default +) + +aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm") + +aten__sparse_semi_structured_mm = ExternKernelChoice( + torch._sparse_semi_structured_mm, + "at::_sparse_semi_structured_mm", + has_out_variant=False, +) + + +def _is_int8_mat(mat): + return mat.get_dtype() in (torch.int8, torch.uint8) + + +def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1): + """ + Giving torch.addmm a 1D tensor calls a different (faster) cublasLt + kernel under the hood. There are a few shapes where this is slower, + but they are rare. + """ + if inp.stride(0) == 0 or inp.size(0) == 1: + return torch.addmm(inp[0], mat1, mat2, out=out, alpha=alpha, beta=beta) + return torch.addmm(inp, mat1, mat2, out=out, alpha=alpha, beta=beta) + + +aten_bias_addmm = ExternKernelChoice(bias_addmm, None) + + +@register_lowering(aten.mm, type_promotion_kind=None) +def tuned_mm(mat1, mat2, *, layout=None): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + name = "mm" + + aten_layout = layout + if not use_max_autotune(): + aten_layout = FlexibleLayout( + device=layout.device, dtype=layout.dtype, size=layout.size + ) + + # options to tune from + choices = ( + [aten_mm.bind((mat1, mat2), aten_layout)] if use_aten_gemm_kernels() else [] + ) + static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout) + if is_nonzero and use_triton_template(layout): + for config in mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2]) + + if is_nonzero and use_ck_template(layout, m, n, k): + CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2]) + + if use_cpp_packed_gemm_template(layout, mat1, mat2): + CppPackedGemmTemplate.add_choices( + choices, + layout, + [mat1, mat2], + ) + + input_nodes = [mat1, mat2] + if ( + is_nonzero + and use_triton_template(layout) + and torch._inductor.config.run_autoheuristic(name) + and is_triton(mat1) + ): + always_included = [] + if use_aten_gemm_kernels(): + always_included.append("extern_mm") + num_choices_before_extra_configs = len(choices) + for config in extra_mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + + # using AutoHeuristic for ranking + ah_choices = mm_autoheuristic( + mat1, + mat2, + m, + n, + k, + choices, + name, + input_nodes, + mm_operations(), + None, + top_k=10, + always_included=always_included, + ) + if not torch._inductor.config.collect_autoheuristic(name): + # if we are collecting data, we do not want to modify choices + if ah_choices is not None and len(ah_choices) > 0: + # the order in which autoheuristic returns choices is not the same 
as
+                # the order of choices, which affects things like epilogue fusion.
+                # Once epilogue fusion benchmarks choices in sorted order, I think we can
+                # just use the order returned by autoheuristic.
+                choices = [choice for choice in choices if choice in ah_choices]
+            else:
+                choices = choices[:num_choices_before_extra_configs]
+
+    if (
+        len(choices) == 0
+        and not use_aten_gemm_kernels()
+        and inductor_config.autotune_fallback_to_aten
+    ):
+        log.warning("No choices for GEMM, using ATen backend as fallback")
+        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
+
+    try:
+        return autotune_select_algorithm(name, choices, [mat1, mat2], layout)
+    except NoValidChoicesError:
+        if not inductor_config.autotune_fallback_to_aten:
+            raise
+        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
+        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
+
+
+def _is_static_problem(inputs_tensors, layout):
+    # Checks whether all input tensors and the output layout have a static
+    # shape, by attempting to convert the dimensions to int.
+    static_shape = True
+    static_size = WrapperCodeGen.statically_known_list_of_ints_or_none(layout.size)
+    if static_size is None:
+        nonzero = True
+        for s in layout.size:
+            sz = WrapperCodeGen.statically_known_int_or_none(s)
+            if sz is not None and sz == 0:
+                nonzero = False
+                break
+        return False, nonzero
+    numel = 1
+    for dim in static_size:
+        numel *= dim
+    nonzero = numel > 0
+    return static_shape, nonzero
+
+
+@register_lowering(aten._int_mm, type_promotion_kind=None)
+def tuned_int_mm(mat1, mat2, *, layout=None):
+    m, n, k, layout, mat1, mat2 = mm_args(
+        mat1, mat2, layout=layout, out_dtype=torch.int32
+    )
+    static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout)
+    use_cutlass = static_shape and is_nonzero and use_cutlass_template(layout, m, n, k)
+
+    choices = (
+        [aten__int_mm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else []
+    )
+
+    # TODO: Re-enable eager mode implementation once cuBLAS is fixed
+    if use_cutlass or use_triton_template(layout, enable_int32=True):
+        choices = []
+
+    if use_cutlass:
+        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
+            choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True
+        )
+    if is_nonzero and use_triton_template(layout, enable_int32=True):
+        for config in int8_mm_configs(m, n, k):
+            mm_template.maybe_append_choice(
+                choices,
+                input_nodes=(mat1, mat2),
+                layout=layout,
+                **mm_options(config, m, n, k, layout),
+            )
+    if len(choices) == 0:
+        log.warning(
+            "No choices for integer GEMM available with the configured backends, using ATen backend as fallback"
+        )
+        choices = [aten__int_mm.bind((mat1, mat2), layout)]
+
+    try:
+        return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
+    except NoValidChoicesError:
+        if not inductor_config.autotune_fallback_to_aten:
+            raise
+        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
+        choices = [aten__int_mm.bind((mat1, mat2), layout)]
+        return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
+
+
+@register_lowering(aten.addmm, type_promotion_kind=None)
+def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
+    ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
+    m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
+    static_shape, is_nonzero = _is_static_problem([inp, mat1, mat2], layout)
+    if (not is_nonzero) or (not use_max_autotune()):
+        # Use a FlexibleLayout if we are not autotuning.
+ # This allows padding strides for the output. + from torch._inductor.ir import FixedLayout, FlexibleLayout + + if isinstance(layout, FixedLayout): + layout = FlexibleLayout( + device=layout.device, dtype=layout.dtype, size=layout.size + ) + choices = ( + [ + aten_addmm.bind( + (inp, mat1, mat2), + layout, + alpha=alpha, + beta=beta, + ) + ] + if use_aten_gemm_kernels() + else [] + ) + return autotune_select_algorithm("addmm", choices, [inp, mat1, mat2], layout) + + choices = ( + [ + aten_addmm.bind( + (inp_expanded, mat1, mat2), + layout, + alpha=alpha, + beta=beta, + ) + ] + if use_aten_gemm_kernels() + else [] + ) + + if ( + use_aten_gemm_kernels() + and inp_expanded.get_stride()[0] == 0 + and inp_expanded.get_device().type == "cuda" + and inductor_config.triton.autotune_cublasLt + ): + # unexpand inp to make sure fused addmm from cublasLt is used + choices.insert( + 0, + aten_bias_addmm.bind( + (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta + ), + ) + + if is_nonzero and use_triton_template(layout): + for config in mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(inp_expanded, mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + prefix_args=1, + epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta), + ) + + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + # Filter out a known cause of CUDA illegal memory access errors + # broadcasting on the last dim of the bias term seems not to be working + # in the linear GEMM epilogue used by addmm. + if ( + WrapperCodeGen.statically_known_int_or_none(inp_expanded.layout.stride[-1]) + != 0 + ): + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices( + choices, + layout, + [mat1, mat2, inp_expanded], + alpha=alpha, + beta=beta, + ) + + if is_nonzero and use_ck_template(layout, m, n, k): + CKGemmTemplate.add_ck_gemm_choices( + choices, + layout, + [mat1, mat2, inp_expanded], + alpha=alpha, + beta=beta, + ) + + if use_cpp_packed_gemm_template(layout, mat1, mat2): + CppPackedGemmTemplate.add_choices( + choices, + layout, + [inp_expanded, mat1, mat2], + alpha=alpha, + beta=beta, + has_bias=True, + ) + + add_aten_fallback = False + if len(choices) == 0: + log.warning("No choices for GEMM, using ATen backend as fallback") + add_aten_fallback = True + + if add_aten_fallback: + choices.append( + aten_addmm.bind( + (inp_expanded, mat1, mat2), + layout, + ordered_kwargs_for_cpp_kernel, + alpha=alpha, + beta=beta, + ) + ) + + if ( + inp_expanded.get_stride()[0] == 0 + and inp_expanded.get_device().type == "cuda" + and inductor_config.triton.autotune_cublasLt + ): + # unexpand inp to make sure fused addmm from cublasLt is used + choices.insert( + 0, + aten_bias_addmm.bind( + (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta + ), + ) + try: + return autotune_select_algorithm( + "addmm", choices, [inp_expanded, mat1, mat2], layout + ) + except NoValidChoicesError: + if not inductor_config.autotune_fallback_to_aten: + raise + log.warning("All choices for GEMM were invalid, using ATen backend as fallback") + fallback_choice = aten_addmm.bind( + (inp, mat1, mat2), + layout, + ordered_kwargs_for_cpp_kernel, + alpha=alpha, + beta=beta, + ) + return fallback_choice.output_node() + + +@register_lowering(aten._sparse_semi_structured_mm, type_promotion_kind=None) +def tuned_sparse_semi_structured_mm( + mat1, mat1_meta, mat2, *, out_dtype=None, layout=None +): + from torch._inductor.select_algorithm import realize_inputs + + mat1, mat1_meta, mat2 = realize_inputs(mat1, 
mat1_meta, mat2) + m1, k1 = mat1.get_size() + m2, _ = mat1_meta.get_size() + k2, n = mat2.get_size() + m = V.graph.sizevars.guard_equals(m1, m2) + k = V.graph.sizevars.guard_equals(2 * k1, k2) + + if layout is None: + from torch._inductor.ir import FixedLayout + + layout = FixedLayout( + mat2.get_device(), + out_dtype if out_dtype else mat2.get_dtype(), + [m, n], + [n, 1], + ) + else: + assert out_dtype is None, "out_dtype is ignored if layout is specified." + + choices = ( + [ + aten__sparse_semi_structured_mm.bind( + (mat1, mat1_meta, mat2), layout, out_dtype=out_dtype + ) + ] + if use_aten_gemm_kernels() + else [] + ) + + if m * n != 0 and use_cutlass_template(layout, m, n, k): + CUTLASS2xGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2, mat1_meta], fuseable=True, non_fuseable=True + ) + + return autotune_select_algorithm( + "sparse_semi_structured_mm", choices, [mat1, mat1_meta, mat2], layout + ) + + +def fallback_mixed_mm(mat1, mat2, *, out): + return torch.mm(mat1, mat2.to(mat1.dtype), out=out) + + +aten_fallback_mixed_mm = ExternKernelChoice(fallback_mixed_mm, None) + + +@functools.lru_cache(None) +def _is_sm7x_or_older_gpu(index: Optional[int]) -> bool: + props = torch.cuda.get_device_properties(index or 0) + return props.major <= 7 + + +def dims_are_int(dims): + return all(isinstance(dim, int) for dim in dims) + + +def try_heuristic(m, n, k, choices, mat1, mat2, mat2_dtype, layout): + m, n, k = get_size_hints(mat1, mat2, m, n, k) + if not dims_are_int([m, n, k]): + return None + + if mat1.dtype != torch.float16: + return None + + # only use heuristic if we are running on an A100 + # torch.cuda.get_device_capability() >= (8, 0) returns true for A10G + # which does not have enough shared memory for one of the configs + if ( + not torch.cuda.get_device_capability() >= (8, 0) + ) or get_gpu_shared_memory() != 166912: + return None + + if m == 1 and (n % 16 != 0 or k % 16 != 0): + return None + + if m <= 16 and n >= 4096 and k >= 4096: + return triton_config( + BLOCK_M=16, + BLOCK_N=64, + BLOCK_K=128, + num_stages=5, + num_warps=4, + ) + elif m > 16 and m <= 32 and n >= 4096 and k >= 4096: + return triton_config( + BLOCK_M=32, + BLOCK_N=32, + BLOCK_K=128, + num_stages=5, + num_warps=4, + ) + elif m > 32 and m <= 64 and n >= 4096 and k >= 4096: + return triton_config( + BLOCK_M=64, + BLOCK_N=32, + BLOCK_K=128, + num_stages=5, + num_warps=4, + ) + return None + + +def mm_autoheuristic( + mat1, + mat2, + m, + n, + k, + choices, + name, + input_nodes, + ops, + precondition, + top_k: Optional[int] = None, + always_included=None, +): + m, n, k = get_size_hints(mat1, mat2, m, n, k) + if not dims_are_int([m, n, k]): + return None + mat1_stride, mat2_stride = get_size_hints_strides(mat1, mat2) + + def get_context(m, k, n, mat1, mat2, mat1_stride, mat2_stride): + context = AHContext() + context.add_feature("m", m) + context.add_feature("k", k) + context.add_feature("n", n) + context.add_feature("mat1_dtype", mat1.layout.dtype, is_categorical=True) + context.add_feature("mat2_dtype", mat2.layout.dtype, is_categorical=True) + context_add_strides(context, "mat1", mat1_stride) + context_add_strides(context, "mat2", mat2_stride) + context.add_feature( + "mat1_iscontig", mat1.layout.is_contiguous(), is_categorical=True + ) + context.add_feature( + "mat2_iscontig", mat2.layout.is_contiguous(), is_categorical=True + ) + if name == "mm": + # for mixed_mm, we only consider fp16 + context_add_using_tf32(context, mat1.layout.dtype) + return context + + def fallback(): + return None + + 
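+    # Assumption about the contract here (not spelled out in this file):
+    # fallback() returning None signals that no heuristic decision was made,
+    # and callers such as tuned_mm then run regular autotuning over all choices.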
context = get_context(m, k, n, mat1, mat2, mat1_stride, mat2_stride) + autoheuristic = AutoHeuristicSelectAlgorithm( + fallback=fallback, + choices=choices, + input_nodes=input_nodes, + context=context, + name=name, + augment_context=ops, + precondition=precondition, + ) + + if top_k is not None: + # TODO: is there a cleaner way to ensure aten.mm is always included? + return autoheuristic.get_top_k_choices_caller( + top_k, always_included=always_included + ) + + return autoheuristic.get_choice_caller() + + +def get_size_hints(mat1, mat2, m, n, k): + if not isinstance(m, int) or not isinstance(k, int): + (m, k) = V.graph.sizevars.size_hints( + mat1.get_size(), + fallback=torch._inductor.config.unbacked_symint_fallback, + ) + + if not isinstance(n, int) or not isinstance(k, int): + (k, n) = V.graph.sizevars.size_hints( + mat2.get_size(), + fallback=torch._inductor.config.unbacked_symint_fallback, + ) + return m, n, k + + +def get_size_hints_strides(mat1, mat2): + mat1_stride = mat1.layout.stride + mat2_stride = mat2.layout.stride + strides = [mat1_stride, mat2_stride] + strides_hints = [] + for stride in strides: + if not isinstance(stride, int): + stride = V.graph.sizevars.size_hints( + stride, + fallback=torch._inductor.config.unbacked_symint_fallback, + ) + strides_hints.append(stride) + return strides_hints[0], strides_hints[1] + + +def tuned_mixed_mm(mat1, mat2, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None) + static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout) + + fallback = aten_fallback_mixed_mm.bind((mat1, mat2), layout) + + choices = [fallback] + + # can't use triton kernel unless one of these is true or if running on v100 (numerical issues) + skip_triton = ( + ( + mat1.layout.dtype != torch.float32 + and not (mat2.layout.is_contiguous() or mat2.layout.is_transposed()) + ) + or _is_sm7x_or_older_gpu(layout.device.index) + or inductor_config.mixed_mm_choice == "aten" + or not V.graph.has_feature(layout.device, BackendFeature.TRITON_TEMPLATES) + or ( + mat1.layout.dtype == torch.float32 and torch.backends.cuda.matmul.allow_tf32 + ) + or (mat1.layout.dtype == torch.bfloat16 and mat2.layout.dtype == torch.uint8) + ) + + if inductor_config.mixed_mm_choice == "triton": + choices = [] + + if not skip_triton: + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + if static_shape and inductor_config.mixed_mm_choice == "heuristic": + choices = [] + config = try_heuristic(m, n, k, choices, mat1, mat2, mat2_dtype, layout) + if config is not None: + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + choices.append(fallback) + + has_int8_tensor = _is_int8_mat(mat1) or _is_int8_mat(mat2) + for config in mixed_mm_configs(m, n, k, has_int8_tensor=has_int8_tensor): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True + ) + CUTLASS2xGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True + ) + + if skip_triton and not choices: + choices = [fallback] + + name = "mixed_mm" + input_nodes = [mat1, mat2] + if torch._inductor.config.run_autoheuristic(name): + choice = mm_autoheuristic( + mat1, + mat2, 
+ m, + n, + k, + choices, + name, + input_nodes, + mixed_mm_operations(), + get_mixedmm_precondition, + ) + if ( + not skip_triton + and inductor_config.mixed_mm_choice == "heuristic" + and choice is not None + ): + choices.insert(0, choice) + return autotune_select_algorithm(name, choices, input_nodes, layout) + + +# This op is a special case of the int_mm op which we use based on the pattern +# _int_mm -> mul (defined in ../fx_passes/post_grad.py) in order to prevent +# realization of the int32 _int_mm output by forcing fusion with the mul op. +# This is only used when config.force_fuse_int_mm_with_mul = True +def tuned_fused_int_mm_mul(mat1, mat2, mat3, out_dtype, *, layout=None): + out_dtype = ( + torch.promote_types(mat3.get_dtype(), torch.int32) + if out_dtype is None + else out_dtype + ) + m, n, k, layout, mat1, mat2, mat3 = mm_args( + mat1, mat2, mat3, layout=layout, out_dtype=out_dtype + ) + choices: List[Dict[Any, Any]] = [] + for config in int8_mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2, mat3), + layout=layout, + **dict(mm_options(config, m, n, k, layout), ACC_TYPE="tl.int32"), + suffix_args=1, + epilogue_fn=V.ops.mul, + ) + return autotune_select_algorithm("int_mm", choices, [mat1, mat2, mat3], layout) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py new file mode 100644 index 0000000000000000000000000000000000000000..0c66da4ca4f314e09e996c2d9860c8f3b9365086 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py @@ -0,0 +1,466 @@ +# mypy: allow-untyped-defs +import functools +import itertools +import logging +from typing import cast, List, Tuple + +import sympy + +import torch +from torch._inductor.select_algorithm import realize_inputs +from torch._inductor.virtualized import V + +from .. 
import config as inductor_config +from ..runtime.runtime_utils import next_power_of_2 +from ..utils import ceildiv as cdiv + + +log = logging.getLogger(__name__) + + +def triton_config(num_stages, num_warps, **kwargs): + from triton import Config + + return Config(kwargs, num_stages=num_stages, num_warps=num_warps) + + +def filtered_configs( + m: int, + n: int, + k: int, + configs: List[Tuple[int, int, int, int, int]], + has_int8_tensor=False, +): + """Heuristic to shrink configs when they are bigger than the input size""" + + min_block_size = 16 + # block_k=16 seems to be causing issues + # see: https://github.com/triton-lang/triton/issues/2156#issuecomment-1695897424 + min_block_size_k = 32 if has_int8_tensor else 16 + m = max( + next_power_of_2( + V.graph.sizevars.size_hint( + m, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + n = max( + next_power_of_2( + V.graph.sizevars.size_hint( + n, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + k = max( + next_power_of_2( + V.graph.sizevars.size_hint( + k, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size_k, + ) + used = set() + for block_m, block_n, block_k, num_stages, num_warps in configs: + # shrink configs for small sizes + block_m = max(min(block_m, m), min_block_size) + block_n = max(min(block_n, n), min_block_size) + block_k = max(min(block_k, k), min_block_size_k) + # each warp computes 16x16 tile = 256 + num_warps = min(num_warps, block_m * block_n // 256) + if torch.version.hip: + for matrix_instr_nonkdim in [0, 16]: + if matrix_instr_nonkdim != 0 and ( + block_m % matrix_instr_nonkdim != 0 + or block_n % matrix_instr_nonkdim != 0 + ): + # block_m and block_n must be a multiple of matrix_instr_nonkdim + continue + if ( + block_m, + block_n, + block_k, + num_stages, + num_warps, + matrix_instr_nonkdim, + ) not in used: + used.add( + ( + block_m, + block_n, + block_k, + num_stages, + num_warps, + matrix_instr_nonkdim, + ) + ) + yield triton_config( + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_stages=num_stages, + num_warps=num_warps, + matrix_instr_nonkdim=matrix_instr_nonkdim, + ) + else: + if (block_m, block_n, block_k, num_stages, num_warps, 0) not in used: + used.add((block_m, block_n, block_k, num_stages, num_warps, 0)) + yield triton_config( + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_stages=num_stages, + num_warps=num_warps, + ) + + +# List of dictionaries to store the kernel configs. Configs that evaluate to true +# will be utilised on the target platform. 
The configs are as follows:
+# (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)
+mm_kernel_configs = (
+    [
+        {"config": (32, 32, 16, 1, 2), "cond": True},
+        {"config": (32, 32, 128, 2, 4), "cond": torch.version.hip is None},
+        {"config": (32, 64, 32, 5, 8), "cond": True},
+        {"config": (64, 32, 32, 5, 8), "cond": True},
+        {"config": (64, 32, 128, 5, 4), "cond": True},
+        {"config": (64, 64, 16, 2, 4), "cond": True},
+        {"config": (64, 64, 32, 2, 4), "cond": True},
+        {"config": (64, 64, 64, 3, 8), "cond": True},
+        {"config": (64, 64, 128, 5, 4), "cond": True},
+        {"config": (64, 128, 32, 3, 4), "cond": True},
+        {"config": (64, 128, 32, 4, 8), "cond": True},
+        {"config": (64, 128, 64, 3, 4), "cond": True},
+        {"config": (64, 128, 128, 4, 4), "cond": True},
+        {"config": (128, 64, 32, 3, 4), "cond": True},
+        {"config": (128, 64, 32, 4, 8), "cond": True},
+        {"config": (128, 128, 32, 2, 8), "cond": True},
+        {"config": (128, 128, 32, 3, 4), "cond": True},
+        {"config": (128, 128, 64, 3, 4), "cond": True},
+        {"config": (128, 128, 64, 5, 8), "cond": True},
+    ]
+    if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE"
+    else [
+        {"config": (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps), "cond": True}
+        for BLOCK_M, BLOCK_N, BLOCK_K in itertools.product(
+            [16, 32, 64, 128, 256], repeat=3
+        )
+        for num_stages in [1, 2, 3, 4, 5]
+        for num_warps in [2, 4, 8]
+    ]
+)
+
+# These are only used in tuned_mm when AutoHeuristic is enabled.
+# The idea is that when AutoHeuristic collects data to learn a heuristic, more configs are autotuned;
+# when the learned heuristic is used, it reduces the number of configs down to 10,
+# which saves compilation time (since fewer configs are autotuned) and can potentially
+# increase performance, because the learned heuristic might predict a config that is
+# not part of mm_configs.
+extra_mm_kernel_configs = [
+    {"config": (16, 32, 16, 3, 2), "cond": True},
+    {"config": (16, 32, 32, 4, 2), "cond": True},
+    {"config": (16, 32, 32, 5, 2), "cond": True},
+    {"config": (64, 64, 128, 3, 4), "cond": True},
+    {"config": (128, 64, 32, 2, 2), "cond": True},
+    {"config": (128, 64, 64, 3, 8), "cond": True},
+    {"config": (128, 64, 128, 4, 8), "cond": True},
+    {"config": (128, 128, 32, 4, 4), "cond": True},
+    {"config": (128, 128, 64, 3, 8), "cond": True},
+    {"config": (128, 128, 64, 5, 4), "cond": True},
+]
+
+int8_mm_kernel_configs = [
+    {"config": (64, 64, 32, 2, 4), "cond": True},
+    {"config": (64, 128, 32, 3, 4), "cond": True},
+    {"config": (128, 64, 32, 3, 4), "cond": True},
+    {"config": (64, 128, 32, 4, 8), "cond": True},
+    {"config": (128, 64, 32, 4, 8), "cond": True},
+    {"config": (64, 32, 32, 5, 8), "cond": True},
+    {"config": (32, 64, 32, 5, 8), "cond": True},
+    {"config": (128, 128, 32, 2, 8), "cond": True},
+    {"config": (64, 64, 64, 3, 8), "cond": True},
+    # {"config": (32, 32, 128, 2, 4), "cond": True},
+    # {"config": (64, 64, 16, 2, 4), "cond": True},
+    # {"config": (32, 32, 16, 1, 2), "cond": True},
+    {"config": (128, 256, 128, 3, 8), "cond": torch.version.hip is None},
+    {"config": (256, 128, 128, 3, 8), "cond": torch.version.hip is None},
+]
+
+# Mixed-precision kernel configs for small sizes of m, for mm's like (16, 8192) x (8192, 8192).
+mixed_mm_kernel_configs_small_m = [ + {"config": (16, 128, 256, 3, 4), "cond": True}, + {"config": (16, 128, 256, 5, 8), "cond": True}, +] + +mixed_mm_kernel_configs = ( + mm_kernel_configs + mixed_mm_kernel_configs_small_m + if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE" + else mm_kernel_configs +) + +scaled_mm_kernel_configs = [ + {"config": (128, 256, 32, 3, 8), "cond": True}, + {"config": (256, 128, 32, 3, 8), "cond": True}, + {"config": (256, 64, 32, 4, 4), "cond": True}, + {"config": (64, 256, 32, 4, 4), "cond": True}, + {"config": (128, 128, 32, 4, 4), "cond": True}, + {"config": (128, 64, 32, 4, 4), "cond": True}, + {"config": (64, 128, 32, 4, 4), "cond": True}, + {"config": (128, 32, 32, 4, 4), "cond": True}, + {"config": (64, 32, 32, 5, 2), "cond": True}, + {"config": (256, 128, 128, 3, 8), "cond": True}, + {"config": (256, 64, 128, 4, 4), "cond": True}, + {"config": (64, 256, 128, 4, 4), "cond": True}, + {"config": (128, 128, 128, 4, 4), "cond": True}, + {"config": (128, 64, 64, 4, 4), "cond": True}, + {"config": (64, 128, 64, 4, 4), "cond": True}, + {"config": (128, 32, 64, 4, 4), "cond": True}, + {"config": (64, 32, 64, 5, 2), "cond": True}, + {"config": (16, 32, 32, 2, 2), "cond": True}, + {"config": (16, 64, 32, 2, 2), "cond": True}, + {"config": (16, 128, 32, 2, 4), "cond": True}, + {"config": (16, 256, 32, 2, 4), "cond": True}, + {"config": (16, 32, 64, 2, 2), "cond": True}, + {"config": (16, 64, 64, 2, 2), "cond": True}, + {"config": (16, 128, 64, 2, 4), "cond": True}, + {"config": (16, 256, 64, 2, 4), "cond": True}, + {"config": (32, 32, 32, 2, 2), "cond": True}, + {"config": (32, 64, 32, 2, 2), "cond": True}, + {"config": (32, 128, 32, 2, 4), "cond": True}, + {"config": (32, 256, 32, 2, 4), "cond": True}, + {"config": (32, 32, 64, 2, 2), "cond": True}, + {"config": (32, 64, 64, 2, 2), "cond": True}, + {"config": (32, 128, 64, 2, 4), "cond": True}, + {"config": (32, 256, 64, 2, 4), "cond": True}, + {"config": (16, 32, 32, 3, 2), "cond": True}, + {"config": (16, 64, 32, 3, 2), "cond": True}, + {"config": (16, 128, 32, 3, 4), "cond": True}, + {"config": (16, 256, 32, 3, 4), "cond": True}, + {"config": (16, 32, 64, 3, 2), "cond": True}, + {"config": (16, 64, 64, 3, 2), "cond": True}, + {"config": (16, 128, 64, 3, 4), "cond": True}, + {"config": (16, 256, 64, 3, 4), "cond": True}, + {"config": (32, 32, 32, 3, 2), "cond": True}, + {"config": (32, 64, 32, 3, 2), "cond": True}, + {"config": (32, 128, 32, 3, 4), "cond": True}, + {"config": (32, 256, 32, 3, 4), "cond": True}, + {"config": (32, 32, 64, 3, 2), "cond": True}, + {"config": (32, 64, 64, 3, 2), "cond": True}, + {"config": (32, 128, 64, 3, 4), "cond": True}, + {"config": (32, 256, 64, 3, 4), "cond": True}, + {"config": (16, 32, 32, 4, 2), "cond": True}, + {"config": (16, 64, 32, 4, 2), "cond": True}, + {"config": (16, 128, 32, 4, 4), "cond": True}, + {"config": (16, 256, 32, 4, 4), "cond": True}, + {"config": (16, 32, 64, 4, 2), "cond": True}, + {"config": (16, 64, 64, 4, 2), "cond": True}, + {"config": (16, 128, 64, 4, 4), "cond": True}, + {"config": (16, 256, 64, 4, 4), "cond": True}, + {"config": (32, 32, 32, 4, 2), "cond": True}, + {"config": (32, 64, 32, 4, 2), "cond": True}, + {"config": (32, 128, 32, 4, 4), "cond": True}, + {"config": (32, 256, 32, 4, 4), "cond": True}, + {"config": (32, 32, 64, 4, 2), "cond": True}, + {"config": (32, 64, 64, 4, 2), "cond": True}, + {"config": (32, 128, 64, 4, 4), "cond": True}, + {"config": (32, 256, 64, 4, 4), "cond": True}, + {"config": (16, 32, 32, 5, 2), 
"cond": True}, + {"config": (16, 64, 32, 5, 2), "cond": True}, + {"config": (16, 128, 32, 5, 4), "cond": True}, + {"config": (16, 256, 32, 5, 4), "cond": True}, + {"config": (16, 32, 64, 5, 2), "cond": True}, + {"config": (16, 64, 64, 5, 2), "cond": True}, + {"config": (16, 128, 64, 5, 4), "cond": True}, + {"config": (16, 256, 64, 5, 4), "cond": True}, + {"config": (32, 32, 32, 5, 2), "cond": True}, + {"config": (32, 64, 32, 5, 2), "cond": True}, + {"config": (32, 128, 32, 5, 4), "cond": True}, + {"config": (32, 256, 32, 5, 4), "cond": True}, + {"config": (32, 32, 64, 5, 2), "cond": True}, + {"config": (32, 64, 64, 5, 2), "cond": True}, + {"config": (32, 128, 64, 5, 4), "cond": True}, + {"config": (32, 256, 64, 5, 4), "cond": True}, + {"config": (16, 32, 32, 6, 2), "cond": True}, + {"config": (16, 64, 32, 6, 2), "cond": True}, + {"config": (16, 128, 32, 6, 4), "cond": True}, + {"config": (16, 256, 32, 6, 4), "cond": True}, + {"config": (16, 32, 64, 6, 2), "cond": True}, + {"config": (16, 64, 64, 6, 2), "cond": True}, + {"config": (16, 128, 64, 6, 4), "cond": True}, + {"config": (16, 256, 64, 6, 4), "cond": True}, + {"config": (32, 32, 32, 6, 2), "cond": True}, + {"config": (32, 64, 32, 6, 2), "cond": True}, + {"config": (32, 128, 32, 6, 4), "cond": True}, + {"config": (32, 256, 32, 6, 4), "cond": True}, + {"config": (32, 32, 64, 6, 2), "cond": True}, + {"config": (32, 64, 64, 6, 2), "cond": True}, + {"config": (32, 128, 64, 6, 4), "cond": True}, + {"config": (32, 256, 64, 6, 4), "cond": True}, +] + + +# Create filtered list of configs based on cond evaluation +mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in mm_kernel_configs + if config["cond"] +) +extra_mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in extra_mm_kernel_configs + if config["cond"] +) +int8_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in int8_mm_kernel_configs + if config["cond"] +) +mixed_mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in mixed_mm_kernel_configs + if config["cond"] +) +scaled_mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in scaled_mm_kernel_configs + if config["cond"] +) + +# On ROCm convert num_stages to 0 to enable software pipelining +if torch.version.hip: + mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in mm_platform_configs + ) + extra_mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in extra_mm_platform_configs + ) + int8_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in mm_platform_configs + ) + mixed_mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in mixed_mm_platform_configs + ) + scaled_mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in scaled_mm_platform_configs + ) + +mm_configs = functools.partial( + filtered_configs, + configs=mm_platform_configs, +) + +extra_mm_configs = functools.partial( + filtered_configs, + configs=extra_mm_platform_configs, +) + +int8_mm_configs = functools.partial( + filtered_configs, + configs=int8_platform_configs, +) + +mixed_mm_configs = functools.partial( + filtered_configs, + configs=mixed_mm_platform_configs, +) + +scaled_mm_configs = functools.partial( + filtered_configs, + 
configs=scaled_mm_platform_configs, +) + + +def mm_grid(m, n, meta): + """ + The CUDA grid size for matmul triton templates. + """ + return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), 1, 1) + + +def acc_type(dtype): + if dtype in (torch.float16, torch.bfloat16): + return "tl.float32" + return f"tl.{dtype}".replace("torch.", "") + + +def mm_options(config, sym_m, sym_n, sym_k, layout, b_prologue_cast_type=None): + """ + Common options to matmul triton templates. + """ + even_k_symbolic = ( + # it isn't worth guarding on this + sympy.gcd(sym_k, config.kwargs["BLOCK_K"]) + == config.kwargs["BLOCK_K"] + ) + allow_tf32 = torch.backends.cuda.matmul.allow_tf32 and ( + not inductor_config.force_same_precision + or ((sym_m % 16) == 0 and (sym_n % 16) == 0 and (sym_k % 8) == 0) + ) + return dict( + GROUP_M=8, + EVEN_K=even_k_symbolic, + ALLOW_TF32=allow_tf32, + ACC_TYPE=acc_type(layout.dtype), + B_PROLOGUE_CAST_TYPE=b_prologue_cast_type, + num_stages=config.num_stages, + num_warps=config.num_warps, + **config.kwargs, + ) + + +def mm_args( + mat1, + mat2, + *others, + layout=None, + out_dtype=None, + use_4x2_dim=False, + mat2_transposed=False, +): + """ + Common arg processing for mm,bmm,addmm,etc + """ + mat1, mat2 = realize_inputs(mat1, mat2) + *b1, m, k1 = mat1.get_size() + if mat2_transposed: + *b2, n, k2 = mat2.get_size() + else: + *b2, k2, n = mat2.get_size() + b = [V.graph.sizevars.guard_equals(a, b) for a, b in zip(b1, b2)] + if use_4x2_dim: + k2 = k2 * 2 + k = V.graph.sizevars.guard_equals(k1, k2) + if layout is None: + from torch._inductor.ir import FixedLayout + + if out_dtype is None: + out_dtype = mat1.get_dtype() + + layout = FixedLayout( + mat1.get_device(), + out_dtype, + [*b, m, n], + ) + else: + assert out_dtype is None, "out_dtype is ignored if layout is specified." 
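+    # Any extra inputs (e.g. the bias passed to addmm) are broadcast to the
+    # output shape below, so an epilogue can load them with the same (m, n)
+    # indices it uses for the accumulator.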
+ from ..lowering import expand + + others = [realize_inputs(expand(x, layout.size)) for x in others] + + return [m, n, k, layout, mat1, mat2, *others] + + +def addmm_epilogue(dtype, alpha, beta): + def epilogue(acc, bias): + if alpha != 1: + acc = V.ops.mul(acc, V.ops.constant(alpha, dtype)) + if beta != 1: + bias = V.ops.mul(bias, V.ops.constant(beta, dtype)) + return V.ops.add(acc, bias) + + return epilogue diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..f432cbb13b6f14d1429aa6a4df0579015ef46975 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py @@ -0,0 +1,248 @@ +# mypy: allow-untyped-defs +import functools + +import torch + +from ..lowering import lowerings +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import use_aten_gemm_kernels, use_triton_template +from ..virtualized import V +from .mm_common import mm_args, mm_grid, mm_options + + +aten = torch.ops.aten + +aten_mm_plus_mm = ExternKernelChoice( + torch.ops.inductor._mm_plus_mm, "torch::inductor::_mm_plus_mm" +) + +mm_plus_mm_template = TritonTemplate( + name="mm_plus_mm", + grid=mm_grid, + debug=False, + source=r""" +{{def_kernel("A", "B", "C", "D")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K1 = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + # K2 = {{size("C", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + stride_cm = {{stride("C", 0)}} + stride_ck = {{stride("C", 1)}} + stride_dk = {{stride("D", 0)}} + stride_dn = {{stride("D", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + if (((stride_am == 1 and stride_ak == M) or (stride_am == K1 and stride_ak == 1)) + and ((stride_cm == 1 and stride_ck == M) or (stride_cm == K1 and stride_ck == 1))): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + + if (((stride_bk == 1 and stride_bn == K1) or (stride_bk == N and stride_bn == 1)) + and ((stride_dk == 1 and stride_dn == K1) or (stride_dk == N and stride_dn == 1))): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + C = C + (ram[:, None] * stride_cm + rk[None, :] * stride_ck) + D = D + (rk[:, None] * stride_dk + rbn[None, :] * stride_dn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k1 in range(K1, 0, -BLOCK_K): + # First matmul with A @ B + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k1, other=0.) + b = tl.load(B, mask=rk[:, None] < k1, other=0.) 
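+        # Both GEMMs accumulate into the same acc tile, so the elementwise add
+        # is fused away; tuned_mm_plus_mm below only takes this path when A/C
+        # and B/D have statically identical shapes.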
+        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
+        A += BLOCK_K * stride_ak
+        B += BLOCK_K * stride_bk
+
+    for k2 in range(K1, 0, -BLOCK_K):
+
+        # Second matmul with C @ D
+        if EVEN_K:
+            c = tl.load(C)
+            d = tl.load(D)
+        else:
+            c = tl.load(C, mask=rk[None, :] < k2, other=0.)
+            d = tl.load(D, mask=rk[:, None] < k2, other=0.)
+        acc += tl.dot(c, d, allow_tf32=ALLOW_TF32)
+        C += BLOCK_K * stride_ck
+        D += BLOCK_K * stride_dk
+
+    idx_m = rm[:, None]
+    idx_n = rn[None, :]
+    mask = (idx_m < M) & (idx_n < N)
+
+    # inductor generates a suffix
+    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
+""",
+)
+
+
+@functools.lru_cache(None)
+def mm_configs():
+    import triton
+
+    # List of dictionaries to store the kernel configs. Configs whose cond
+    # evaluates to true will be utilised on the target platform.
+    mm_triton_configs = [
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 2,
+            "num_warps": 4,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 3,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 4,
+            "num_warps": 16,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32},
+            "num_stages": 4,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 4,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32},
+            "num_stages": 1,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64},
+            "num_stages": 1,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128},
+            "num_stages": 1,
+            "num_warps": 8,
+            "cond": torch.version.hip is None,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16},
+            "num_stages": 2,
+            "num_warps": 4,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16},
+            "num_stages": 1,
+            "num_warps": 2,
+            "cond": True,
+        },
+    ]
+
+    # Keep only the configs whose cond evaluates to true.
+    # On ROCm, convert num_stages to 1, as pipelining provides no benefit there.
+    if torch.version.hip:
+        filtered_configs = [
+            triton.Config(c["config"], num_stages=1, num_warps=c["num_warps"])
+            for c in mm_triton_configs
+            if c["cond"]
+        ]
+    else:
+        filtered_configs = [
+            triton.Config(
+                c["config"], num_stages=c["num_stages"], num_warps=c["num_warps"]
+            )
+            for c in mm_triton_configs
+            if c["cond"]
+        ]
+
+    return filtered_configs
+
+
+def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
+    """
+    Computes mm(mat1, mat2) + mm(mat3, mat4)
+    """
+    m1, n1, k1, layout1, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
+    m2, n2, _, layout2, mat3, mat4 = mm_args(mat3, mat4, layout=layout)
+    # The optimization is optional, because we can always just not do the fusion.
+    if (
+        m1 * n1 == 0
+        or m2 * n2 == 0
+        or not V.graph.sizevars.statically_known_list_equals(
+            mat1.get_size(), mat3.get_size()
+        )
+        or not V.graph.sizevars.statically_known_list_equals(
+            mat2.get_size(), mat4.get_size()
+        )
+    ):
+        # TODO(jansel): support different K values when this is fixed:
+        # https://github.com/openai/triton/issues/967
+        return lowerings[aten.add](
+            lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4)
+        )
+
+    assert layout1 == layout2
+    # options to tune from
+    choices = (
+        [aten_mm_plus_mm.bind((mat1, mat2, mat3, mat4), layout1)]
+        if use_aten_gemm_kernels()
+        else []
+    )
+    if use_triton_template(layout1):
+        for config
in mm_configs(): + # see https://github.com/openai/triton/issues/1298 + # BLOCK_K = K causes llvm error + if V.graph.sizevars.statically_known_lt(config.kwargs["BLOCK_K"], k1): + mm_plus_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2, mat3, mat4), + layout=layout1, + **mm_options(config, m1, n1, k1, layout1), + ) + + return autotune_select_algorithm( + "mm_plus_mm", choices, [mat1, mat2, mat3, mat4], layout1 + ) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py new file mode 100644 index 0000000000000000000000000000000000000000..2f0d020716a199c802d07a19e906e20abb7e04ef --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py @@ -0,0 +1,311 @@ +import logging +from typing import Any, Dict, List, Optional, Tuple + +import sympy + +import torch + +from .. import config as inductor_config +from ..ir import ChoiceCaller, Layout, StorageBox, TensorBox +from ..lowering import add_layout_constraint, constrain_to_fx_strides, register_lowering +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + NoValidChoicesError, + realize_inputs, + TritonTemplate, +) +from ..utils import use_aten_gemm_kernels, use_triton_template +from .mm import _is_static_problem # TODO(yangsiyu) move to mm_common +from .mm_common import mm_args, mm_grid, scaled_mm_configs + + +log = logging.getLogger(__name__) +aten = torch.ops.aten + + +scaled_mm_template = TritonTemplate( + name="scaled_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
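+        # With USE_FAST_ACCUM, the running tile is passed to tl.dot as its
+        # accumulator argument, which (on hardware that supports it) keeps the
+        # accumulation inside the FP8 MMA; the else branch instead performs a
+        # separate fp32 add after each tl.dot.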
+ if B_PROLOGUE_CAST_TYPE is not None: + b = b.to(B_PROLOGUE_CAST_TYPE) + if USE_FAST_ACCUM: + acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE) + else: + acc += tl.dot(a, b, out_dtype=ACC_TYPE) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + if SCALING_ROWWISE: + inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M) + inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N) + inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :] + acc *= inv_scale_row + else: + # for tensor-wise scaling, the scales are scalars + inv_a_scale = tl.load(A_inverse_scale) + inv_b_scale = tl.load(B_inverse_scale) + inv_scale = inv_a_scale * inv_b_scale + acc *= inv_scale + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +# Inductor does not allow optional tensor input arguments currently (pass None as an +# input node to template choices), but since for _scaled_mm there is only one such arg +# (bias), work around by having a second template when bias is provided. +scaled_mm_bias_template = TritonTemplate( + name="scaled_mm_bias", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale", "bias_ptr")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
+        if B_PROLOGUE_CAST_TYPE is not None:
+            b = b.to(B_PROLOGUE_CAST_TYPE)
+        if USE_FAST_ACCUM:
+            acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE)
+        else:
+            acc += tl.dot(a, b, out_dtype=ACC_TYPE)
+        A += BLOCK_K * stride_ak
+        B += BLOCK_K * stride_bk
+
+    if SCALING_ROWWISE:
+        inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M)
+        inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N)
+        inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :]
+        acc *= inv_scale_row
+    else:
+        # for tensor-wise scaling, the scales are scalars
+        inv_a_scale = tl.load(A_inverse_scale)
+        inv_b_scale = tl.load(B_inverse_scale)
+        inv_scale = inv_a_scale * inv_b_scale
+        acc *= inv_scale
+
+    # rematerialize rm and rn to save registers
+    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    # bias
+    bias = tl.load(bias_ptr + rn, mask=rn < N)
+    acc += bias
+
+    idx_m = rm[:, None]
+    idx_n = rn[None, :]
+    mask = (idx_m < M) & (idx_n < N)
+
+    # inductor generates a suffix
+    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
+""",
+)
+
+
+aten__fp8_mm = ExternKernelChoice(torch._scaled_mm, "at::_scaled_mm")
+
+
+def are_compatible_scales(size_a: List[int], size_b: List[int]) -> bool:
+    # Same-sized scales are compatible
+    if len(size_a) == len(size_b):
+        return True
+
+    # Both need to be scalars or len(1) tensors
+    if len(size_a) <= 1 and len(size_b) <= 1:
+        return True
+
+    return False
+
+
+def scaled_mm_options(  # type: ignore[no-untyped-def]
+    config,  # triton.Config
+    sym_m: sympy.core.numbers.Integer,
+    sym_n: sympy.core.numbers.Integer,
+    sym_k: sympy.core.numbers.Integer,
+    layout: Layout,
+    scale_a: StorageBox,
+    scale_b: StorageBox,
+    use_fast_accum: bool,
+    b_prologue_cast_type: Optional[str] = None,
+) -> Dict[str, Any]:
+    even_k_symbolic = (
+        sympy.gcd(sym_k, config.kwargs["BLOCK_K"]) == config.kwargs["BLOCK_K"]
+    )
+
+    size_a, size_b = scale_a.get_size(), scale_b.get_size()
+    assert are_compatible_scales(size_a, size_b), (
+        "Expect scale_a and scale_b to be either both scalars (including single-element tensors) "
+        f"or 1-dimensional tensors with the same size. Got scale_a: {len(size_a)} and scale_b: {len(size_b)}."
+    )
+    return dict(
+        GROUP_M=8,
+        EVEN_K=even_k_symbolic,
+        ACC_TYPE="tl.float32",
+        B_PROLOGUE_CAST_TYPE=b_prologue_cast_type,
+        USE_FAST_ACCUM=use_fast_accum,
+        num_stages=config.num_stages,
+        num_warps=config.num_warps,
+        # row-wise scaling when the scales are 2-D tensors; tensor-wise when they are scalars
+        SCALING_ROWWISE=len(scale_a.get_size()) == 2,
+        **config.kwargs,
+    )
+
+
+add_layout_constraint(aten._scaled_mm.default, constrain_to_fx_strides)
+
+
+@register_lowering(aten._scaled_mm.default, type_promotion_kind=None)  # type: ignore[misc]
+def tuned_scaled_mm(
+    mat_a: TensorBox,
+    mat_b: TensorBox,
+    scale_a: TensorBox,
+    scale_b: TensorBox,
+    bias: Optional[TensorBox] = None,
+    scale_result: Optional[TensorBox] = None,
+    out_dtype: Optional[torch.dtype] = None,
+    use_fast_accum: bool = False,
+    layout: Optional[Layout] = None,
+) -> TensorBox:
+    m, n, k, layout, mat_a, mat_b = mm_args(
+        mat_a, mat_b, layout=layout, out_dtype=out_dtype
+    )
+    scale_a, scale_b = realize_inputs(scale_a, scale_b)
+
+    input_nodes: Tuple[Any, ...]
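+    # Hypothetical eager-mode sketch of the call this lowers (tensor-wise
+    # scaling; variable names are illustrative, not from this file):
+    #   a8 = a.to(torch.float8_e4m3fn)
+    #   b8 = b.t().contiguous().t().to(torch.float8_e4m3fn)  # column-major mat2
+    #   out = torch._scaled_mm(a8, b8, scale_a, scale_b, out_dtype=torch.bfloat16)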
+ # workaround for Inductor not supporting optional tensor input arguments + if bias is None: + input_nodes = (mat_a, mat_b, scale_a, scale_b) + triton_template = scaled_mm_template + else: + bias = realize_inputs(bias) + input_nodes = (mat_a, mat_b, scale_a, scale_b, bias) + triton_template = scaled_mm_bias_template + + aten_choice = aten__fp8_mm.bind( + input_nodes, layout, out_dtype=out_dtype, use_fast_accum=use_fast_accum + ) + + choices: List[ChoiceCaller] = [] + if use_aten_gemm_kernels(): + choices.append(aten_choice) + + static_shape, is_nonzero = _is_static_problem([mat_a, mat_b], layout) + if is_nonzero and use_triton_template(layout, enable_float8=True): + for config in scaled_mm_configs(m, n, k): + if k == 16 and config.kwargs["BLOCK_M"] >= 64: + continue # Triton crashes in this case + kwargs = scaled_mm_options( + config, m, n, k, layout, scale_a, scale_b, use_fast_accum + ) + # possibly appends a TritonTemplateCaller to choices + triton_template.maybe_append_choice( + choices, + input_nodes=input_nodes, + layout=layout, + **kwargs, + ) + + if ( + len(choices) == 0 + and not use_aten_gemm_kernels() + and inductor_config.autotune_fallback_to_aten + ): + log.warning("No choices for scaled_mm, using ATen backend as fallback") + return aten_choice.output_node() + + try: + return autotune_select_algorithm("scaled_mm", choices, input_nodes, layout) + except NoValidChoicesError: + if not inductor_config.autotune_fallback_to_aten: + raise + log.warning( + "All choices for scaled_mm were invalid, using ATen backend as fallback" + ) + return aten_choice.output_node() diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..674da97c165546acf8397f57e52930e60a7c195d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py @@ -0,0 +1,87 @@ +# mypy: allow-untyped-defs +import logging +from typing import List, TYPE_CHECKING + +from ..select_algorithm import autotune_select_algorithm, TritonTemplate +from .mm_common import mm_args, mm_configs, mm_grid, mm_options + + +if TYPE_CHECKING: + from ..ir import ChoiceCaller + +log = logging.getLogger(__name__) + +uint4x2_mixed_mm_template = TritonTemplate( + name="uint4x2_mixed_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None]//2 * stride_bk + rbn[None, :] * stride_bn) + b_shifts = 4*(rk%2) + b_subs = 8*(1-(rk%2)) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): 
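+        # Each int8 element of B packs two uint4 values: b_shifts selects the
+        # low nibble for even rk and the high nibble for odd rk, and the "- 8"
+        # below recenters the unsigned nibble from [0, 15] to signed [-8, 7].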
+ if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) + b = ((b >> b_shifts[:, None]) & 0xF) - 8 + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K//2 * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +def tuned_uint4x2_mixed_mm(mat1, mat2, mat2_mm_shape, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None, use_4x2_dim=True) + choices: List[ChoiceCaller] = [] + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + for config in mm_configs(m, n, k): + uint4x2_mixed_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + return autotune_select_algorithm("uint4x2_mixed_mm", choices, [mat1, mat2], layout) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__init__.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2241377aeefe75227902da2c6c01c6589bd4cd39 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff2cb55adc68cf76d5cb07ea8702ecbfcf678286 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb83530d1e742b489d74b58814e69bf998e96ace Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf012aa405a51756a414e8b2610a9260738ee55a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..a2befe423bd41386df420a2bdb5247e441a26f2b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/halide_helpers.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/halide_helpers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03fe9fdf15b578e19d1e209739ec0510de2ec577 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/halide_helpers.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/hints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/hints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d500c5f757c4e72a55a12338de26e6cc4fec1acb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/hints.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/runtime_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/runtime_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92697cbc01d7696d9b3e8cddfef6d4ed84456118 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/runtime_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_helpers.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_helpers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f9eaa9ee60447b080aa2913381df9c54397a06e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_helpers.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_heuristics.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_heuristics.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83d455ac3591b74faa7e7e5114f9e18d4b56cd9f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_heuristics.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/autotune_cache.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/autotune_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..65dfc73d63d72d89f736ada79d7b9c7179e9dfb7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/autotune_cache.py @@ -0,0 +1,237 @@ +from __future__ import annotations + +import dataclasses +import hashlib +import logging +import os +import os.path +from typing import Dict, List, Optional, Tuple +from typing_extensions import override + +import torch +from torch.utils._triton import has_triton_package + +from ..remote_cache import ( + JsonDataTy, + RemoteCache, + RemoteCacheBackend, + RemoteCacheJsonSerde, +) + + +if has_triton_package(): + from triton import Config + +log = logging.getLogger(__name__) + + +_InductorMetaTy = Dict[str, object] + + +@dataclasses.dataclass +class AutotuneCache: + configs_hash: str + 
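+    # configs_hash identifies the candidate config list; mismatched hashes
+    # invalidate cached entries (see _load_cached_autotuning below)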
filename: str
+    local_cache: Optional[Tuple[RemoteCache[JsonDataTy], str]] = None
+    remote_cache: Optional[Tuple[RemoteCache[JsonDataTy], str]] = None
+
+    # Create an AutotuneCache. Returns None if none of the caches can be used.
+    @staticmethod
+    def create(
+        inductor_meta: _InductorMetaTy, filename: str, configs_hash: str
+    ) -> Optional[AutotuneCache]:
+        cache = AutotuneCache(configs_hash, filename)
+        cache._setup_local_cache(inductor_meta, filename)
+        cache._setup_remote_autotune_cache(inductor_meta, filename)
+        if cache.local_cache or cache.remote_cache:
+            return cache
+        else:
+            return None
+
+    # Read the best config options from the most local cache and return it.
+    def _read(self, inductor_meta: _InductorMetaTy) -> Optional[Dict[str, JsonDataTy]]:
+        if local_cache := self.local_cache:
+            cache, key = local_cache
+            if best_config := cache.get(key):
+                if isinstance(best_config, dict):
+                    return best_config
+
+        if remote_cache := self.remote_cache:
+            cache, key = remote_cache
+            if best_config := cache.get(key):
+                if isinstance(best_config, dict):
+                    return best_config
+
+        return None
+
+    # Read the best config options from the most local cache and figure out
+    # which `configs` represents that option.
+    def read_best(
+        self, inductor_meta: _InductorMetaTy, configs: List[Config]
+    ) -> Optional[Config]:
+        if best := self._read(inductor_meta):
+            return _load_cached_autotuning(
+                best, self.configs_hash, configs, inductor_meta
+            )
+        return None
+
+    # Set up local filesystem caching information
+    def _setup_local_cache(self, inductor_meta: _InductorMetaTy, filename: str) -> None:
+        if not inductor_meta.get("autotune_local_cache", True):
+            return
+
+        cache_filename = os.path.splitext(filename)[0] + ".best_config"
+        local_cache = RemoteCache(_LocalAutotuneCacheBackend(), RemoteCacheJsonSerde())
+        self.local_cache = (local_cache, cache_filename)
+
+    # Set up remote caching information
+    def _setup_remote_autotune_cache(
+        self, inductor_meta: _InductorMetaTy, filename: str
+    ) -> None:
+        if not _should_use_remote_autotune_cache(inductor_meta):
+            return
+
+        remote_cache = _create_cache(
+            inductor_meta,
+            self.configs_hash,
+            "FbRemoteAutotuneCache",
+            "RemoteAutotuneCache",
+            "autotune-best-config-v2",
+        )
+        if not remote_cache:
+            return
+
+        # we already sha256 hash the source contents
+        remote_cache_key = os.path.basename(filename)
+        self.remote_cache = (remote_cache, remote_cache_key)
+
+    # Save the config in the caches
+    def save(
+        self, config: Config, time_taken_ns: int, found_by_coordesc: bool = False
+    ) -> None:
+        data = {
+            **config.kwargs,
+            "num_warps": config.num_warps,
+            "num_stages": config.num_stages,
+            "configs_hash": self.configs_hash,
+            "found_by_coordesc": found_by_coordesc,
+            "time_taken_ms": time_taken_ns // 1000000,  # Convert from NS to MS
+        }
+
+        if local_cache := self.local_cache:
+            cache, key = local_cache
+            cache.put(key, data)
+
+            if log.isEnabledFor(logging.DEBUG):
+                type_str = "coordesc" if found_by_coordesc else "heuristic"
+                log.debug("Save %s tuning result to %s", type_str, key)
+
+        if remote_cache := self.remote_cache:
+            cache, key = remote_cache
+            cache.put(key, data)
+
+
+def _should_use_remote_autotune_cache(inductor_meta: Dict[str, object]) -> bool:
+    if (config := inductor_meta.get("autotune_remote_cache")) is not None:
+        return bool(config)
+    if not inductor_meta.get("is_fbcode"):
+        return False
+    if torch._utils_internal.is_fb_unit_test():
+        return False
+    if inductor_meta.get("is_hip"):
+        return False
+
+    try:
+        from torch._inductor.fb.remote_cache
import REMOTE_CACHE_VERSION + except ModuleNotFoundError: + return False + + return REMOTE_CACHE_VERSION >= torch._utils_internal.justknobs_getval_int( + "pytorch/remote_cache:autotune_memcache_version" + ) + + +def _load_cached_autotuning( + best_config: Dict[str, JsonDataTy], + configs_hash: str, + configs: List[Config], + inductor_meta: Dict[str, object], +) -> Optional[Config]: + if best_config is None: + return None + if best_config.pop("configs_hash", None) != configs_hash: + return None + + # Remove time taken for comparison + best_config.pop("time_taken_ms", None) + + if inductor_meta.get("coordinate_descent_tuning") and best_config.pop( + "found_by_coordesc", False + ): + num_warps = best_config.pop("num_warps") + num_stages = best_config.pop("num_stages") + triton_config = Config(best_config, num_warps=num_warps, num_stages=num_stages) + triton_config.found_by_coordesc = True + return triton_config + + matching_configs = [ + cfg + for cfg in configs + if all(val == best_config.get(key) for key, val in cfg.kwargs.items()) + and cfg.num_warps == best_config.get("num_warps") + and cfg.num_stages == best_config.get("num_stages") + ] + if len(matching_configs) != 1: + return None + + return matching_configs[0] + + +def _create_cache( + inductor_meta: Dict[str, object], + configs_hash: str, + fb_cache_cls: str, + oss_cache_cls: str, + salt: str, +) -> Optional[RemoteCache[JsonDataTy]]: + backend_hash = inductor_meta.get("backend_hash", None) + if backend_hash is None: + log.debug( + "backend_hash is not passed on the inductor_meta, unable to use autotune remote cache" + ) + return None + + assert isinstance(backend_hash, str) + + key = backend_hash + configs_hash + salt + key = hashlib.sha256(key.encode("utf-8")).hexdigest() + + try: + if inductor_meta.get("is_fbcode"): + import torch._inductor.fb.remote_cache + + cache_cls = getattr(torch._inductor.fb.remote_cache, fb_cache_cls) + return cache_cls(key) + else: + import torch._inductor.remote_cache + + cache_cls = getattr(torch._inductor.remote_cache, oss_cache_cls) + return cache_cls(key) + except Exception: + log.warning("Unable to create a remote cache", exc_info=True) + return None + + +class _LocalAutotuneCacheBackend(RemoteCacheBackend[bytes]): + @override + def get(self, key: str) -> Optional[bytes]: + try: + with open(key, "rb") as fd: + return fd.read() + except FileNotFoundError: + return None + + @override + def put(self, key: str, data: bytes) -> None: + with open(key, "wb") as fd: + fd.write(data) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.py new file mode 100644 index 0000000000000000000000000000000000000000..44c222c3f936eff7b1f2ddc5a8953c5dd1e82539 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.py @@ -0,0 +1,204 @@ +import time +from functools import cached_property, wraps +from itertools import chain +from statistics import median +from typing import Any, Callable, Dict, List, Tuple +from typing_extensions import Concatenate, ParamSpec, Self, TypeVar + +import torch +from torch._dynamo.utils import counters + + +logger = torch._logging.getArtifactLogger(__name__, "benchmarking") + + +MILLISECONDS_PER_SECOND = 1000 + +P = ParamSpec("P") +T = TypeVar("T") + + +def maybe_time( + fn: Callable[Concatenate[Any, P], T] +) -> Callable[Concatenate[Any, P], T]: + """Wrapper that logs the duration of `fn`, in milliseconds, along with a representation + of the function's 
args and kwargs, if logging is enabled. It is expected that `fn` is
+    a method of `Benchmarker` or one of its subclasses; typing limitations prevent us from
+    declaring this directly. If logging is disabled, this becomes a no-op.
+    """
+
+    # no-op if benchmarking-specific logging is disabled
+    if not torch._logging._internal.log_state.is_artifact_enabled("benchmarking"):
+        return fn
+
+    @wraps(fn)
+    def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        start_t = time.perf_counter()
+        result = fn(self, *args, **kwargs)
+        logger.debug(
+            "Call `benchmarking.%s.%s(*args=%r, **kwargs=%r)` took %f milliseconds.",
+            self.__class__.__name__,
+            fn.__name__,
+            args,
+            kwargs,
+            (time.perf_counter() - start_t) * MILLISECONDS_PER_SECOND,
+        )
+        return result
+
+    return wrapper
+
+
+def count(fn: Callable[Concatenate[Any, P], T]) -> Callable[Concatenate[Any, P], T]:
+    """Wrapper that increments relevant dynamo counters on `fn` call. It is expected that
+    `fn` is a method of `Benchmarker` or one of its subclasses; typing limitations prevent
+    us from declaring this directly. The counter incrementation follows the formula,
+
+    `counters["inductor"]["benchmarking.Foo.bar"] += 1`
+
+    where `Foo` is the class whose instance called the function, and `bar` is the function name.
+    """
+
+    @wraps(fn)
+    def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        counters["inductor"][
+            "benchmarking." + self.__class__.__name__ + "." + fn.__name__
+        ] += 1
+        return fn(self, *args, **kwargs)
+
+    return wrapper
+
+
+class Benchmarker:
+    def __init__(self: Self) -> None:
+        pass
+
+    @maybe_time
+    @count
+    def benchmark(
+        self: Self,
+        fn: Callable[..., Any],
+        fn_args: Tuple[Any],
+        fn_kwargs: Dict[str, Any],
+        **kwargs: Any,
+    ) -> float:
+        """Benchmark `fn(*fn_args, **fn_kwargs)` and return the runtime, in milliseconds (the
+        actual runtime calculation is dictated by the benchmarking implementation, but may be
+        one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
+        device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
+        `ValueError(...)` if we can't safely infer the device type of `fn`; for example,
+        if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
+        types are found.
+
+        Arguments:
+        - fn: The function to benchmark.
+        - fn_args: The function's arguments.
+        - fn_kwargs: The function's kwargs.
+
+        Keyword Arguments:
+        - **kwargs: The benchmarking implementation's kwargs.
+
+        Returns:
+        - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
+        """
+        inferred_device = None
+        for arg_or_kwarg in chain(fn_args, fn_kwargs.values()):
+            if not isinstance(arg_or_kwarg, torch.Tensor):
+                continue
+            if inferred_device is None:
+                inferred_device = arg_or_kwarg.device
+            elif arg_or_kwarg.device != inferred_device:
+                raise ValueError(
+                    "Can't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!"
+                )
+        if inferred_device is None:
+            raise ValueError(
+                "Can't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly."
# noqa: B950 + ) + _callable = lambda: fn(*fn_args, **fn_kwargs) # noqa: E731 + if inferred_device == torch.device("cpu"): + return self.benchmark_cpu(_callable, **kwargs) + # TODO(nmacchioni): For non-CPU functions we default to using the GPU-specific benchmarking + # implementation which was written specifically with CUDA devices in mind, we may want to + # explore alternate implementations for other device types. + return self.benchmark_gpu(_callable, **kwargs) + + @maybe_time + @count + def benchmark_cpu( + self: Self, _callable: Callable[[], Any], warmup: int = 20, rep: int = 100 + ) -> float: + """Benchmark the CPU callable, `_callable`, and return the median runtime, + in milliseconds. + + Arguments: + - _callable: The CPU callable to benchmark. + + Keyword Arguments: + - warmup: Optionally, the duration, in milliseconds, to run `_callable` + before benchmarking starts. + - rep: Optionally, the duration, in milliseconds, to run `_callable` + during benchmarking. + + Returns: + - The median runtime of `_callable`, in milliseconds. + """ + + def run_for(ms: int) -> List[float]: + timings = [] + run_start_t = time.perf_counter() + while True: + start_t = time.perf_counter() + _callable() + end_t = time.perf_counter() + timings.append((end_t - start_t) * MILLISECONDS_PER_SECOND) + if ((end_t - run_start_t) * MILLISECONDS_PER_SECOND) > ms: + break + return timings + + run_for(warmup) + return median(run_for(rep)) + + @count + def benchmark_gpu(self: Self, *args: Any, **kwargs: Any) -> float: + raise NotImplementedError + + +class TritonBenchmarker(Benchmarker): + @cached_property + @maybe_time + @count + def triton_do_bench(self: Self) -> Callable[..., Any]: + """Lazily import Triton's `do_bench`.""" + try: + from triton.testing import do_bench + except ImportError as e: + raise NotImplementedError("requires Triton") from e + return do_bench + + @maybe_time + @count + def benchmark_gpu(self: Self, _callable: Callable[[], Any], **kwargs: Any) -> float: + """Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds. + + Arguments: + - _callable: The GPU callable to benchmark. + + Keyword Arguments: + - quantiles: Optionally, a tuple of floats denoting the requested quantiles. + - return_mode: Optionally, the requested return mode. Currently, Triton's + `do_bench` supports min, max, mean, and median return modes. + - **kwargs: Additional kwargs passed to Triton's `do_bench`. + + Returns: + - The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified, + this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified, + this is the requested return mode. Otherwise, this is the median. 
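+        Note: if both `quantiles` and `return_mode` are passed, `quantiles`
+        takes precedence (see the dispatch below).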
+ """ + if "quantiles" in kwargs: + return self.triton_do_bench(_callable, **kwargs)[0] + elif "return_mode" in kwargs: + return self.triton_do_bench(_callable, **kwargs) + return self.triton_do_bench(_callable, **kwargs, return_mode="median") + + +benchmarker = TritonBenchmarker() diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..17788ab7920fb0c23c03ddb8caaa805a82a03785 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py @@ -0,0 +1,68 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import functools +import os +import sys +import warnings +from types import ModuleType +from typing import Any, Callable, Dict + + +def _reload_triton_kernel_in_subproc(reload_module, kernel_name): + return _module_to_triton_kernel(reload_module(), kernel_name) + + +def _module_to_triton_kernel(mod, kernel_name): + kernel = getattr(mod, kernel_name) + kernel._reload_in_subproc = functools.partial( + _reload_triton_kernel_in_subproc, + mod._reload_in_subproc, + kernel_name, + ) + return kernel + + +def _reload_python_module_in_subproc(key, path): + codecache = sys.modules.get("torch._inductor.codecache") + if codecache: + return codecache.PyCodeCache.load_by_key_path(key, path) + else: + return _reload_python_module(key, path) + + +def _reload_python_module(key, path): + with open(path) as f: + try: + code = compile(f.read(), path, "exec", dont_inherit=True) + except Exception as e: + raise RuntimeError( + f"Failed to import {path}\n{type(e).__name__}: {e}" + ) from None + mod = ModuleType(f"{__name__}.{key}") + mod.__file__ = path + mod.key = key # type: ignore[attr-defined] + exec(code, mod.__dict__, mod.__dict__) + sys.modules[mod.__name__] = mod + return mod + + +@functools.lru_cache(None) +def _set_triton_ptxas_path() -> None: + if os.environ.get("TRITON_PTXAS_PATH") is not None: + return + ptxas_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "bin", "ptxas") + ) + if not os.path.exists(ptxas_path): + return + if os.path.isfile(ptxas_path) and os.access(ptxas_path, os.X_OK): + os.environ["TRITON_PTXAS_PATH"] = ptxas_path + else: + warnings.warn(f"{ptxas_path} exists but is not an executable") + + +def _worker_compile_triton(load_kernel: Callable[[], Any], extra_env: Dict[str, str]): + _set_triton_ptxas_path() + os.environ.update(extra_env) + load_kernel().precompile(warm_cache_only=True) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/coordinate_descent_tuner.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/coordinate_descent_tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..4c2af613a04cac07258eddb6abd2248c89936ed7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -0,0 +1,304 @@ +# mypy: allow-untyped-defs +import copy +import itertools +import logging +from typing import Callable, Optional + +from .hints import TRITON_MAX_BLOCK +from .runtime_utils import red_text, triton_config_to_hashable + + +try: + import triton +except ImportError: + triton = None + +log = logging.getLogger(__name__) + + +def get_field(config, name): + if name == "num_warps": + return config.num_warps + elif name == "num_stages": + return config.num_stages + else: + return config.kwargs.get(name, None) + + +def set_field(config, name, 
value):
+    if name == "num_warps":
+        config.num_warps = value
+    elif name == "num_stages":
+        config.num_stages = value
+    else:
+        config.kwargs[name] = value
+
+
+class CoordescTuner:
+    """
+    The coordinate descent tuner. Tune one field/coordinate at a time.
+
+    TODO: will it be necessary to tune multiple fields simultaneously?
+
+    TODO: what if both increasing and decreasing a field can improve perf,
+    i.e., there are multiple local optima?
+    """
+
+    def __init__(
+        self, is_mm=False, name="unknown", size_hints=None, inductor_meta=None
+    ):
+        self.is_mm = is_mm  # we will tune num_stages for mm
+        self.cached_benchmark_results = {}
+        self.name = name
+        self.size_hints = size_hints
+        self.inductor_meta = inductor_meta or {}
+
+    def prefix_to_size_hint(self, prefix: str) -> Optional[int]:
+        size_hint_idx = {"X": 0, "Y": 1, "Z": 2, "R": -1}[prefix]
+
+        have_size_hint = (
+            self.size_hints is not None
+            and len(self.size_hints) > 0
+            and len(self.size_hints) > size_hint_idx
+        )
+        return self.size_hints[size_hint_idx] if have_size_hint else None
+
+    def get_config_max(self, prefix: str) -> int:
+        max_block = TRITON_MAX_BLOCK[prefix]
+        size_hint = self.prefix_to_size_hint(prefix)
+        return min(max_block, size_hint) if size_hint is not None else max_block
+
+    def get_warpsmax(self):
+        # Currently, CUDA has a maximum of 1024 threads, so 32 is the max
+        # number of warps.
+        return 1024 // 32
+
+    def cache_benchmark_result(self, config, timing):
+        self.cached_benchmark_results[triton_config_to_hashable(config)] = timing
+
+    def lookup_in_cache(self, config):
+        return self.cached_benchmark_results.get(triton_config_to_hashable(config))
+
+    def call_func(self, func, config):
+        found = self.lookup_in_cache(config)
+        if found is not None:
+            log.debug("  CACHED")
+            return found
+        timing = func(config)
+        self.cache_benchmark_result(config, timing)
+        return timing
+
+    @property
+    def tunable_fields(self):
+        out = [
+            "XBLOCK",
+            "YBLOCK",
+            "ZBLOCK",
+            # NOTE: we should not tune RBLOCK for persistent reduction.
+            # We rely on the fact that persistent reduction's triton.Config
+            # does not have the RBLOCK field to guarantee that.
+            "RBLOCK",
+            # the following 3 are for mm
+            "BLOCK_M",
+            "BLOCK_N",
+            "BLOCK_K",
+            "num_warps",
+        ]
+        if self.is_mm:
+            out.append("num_stages")
+
+        return out
+
+    def value_too_large(self, name: str, val: int) -> bool:
+        if name in {"XBLOCK", "YBLOCK", "ZBLOCK", "RBLOCK"}:
+            return val > self.get_config_max(name[0])
+        if name == "num_warps":
+            return val > self.get_warpsmax()
+
+        return False
+
+    def get_neighbour_values(self, name, orig_val, radius=1, include_self=False):
+        """
+        Get neighbour values in 'radius' steps. The original value is not
+        returned as its own neighbour.
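+        For example, with name="XBLOCK", orig_val=8 and radius=1 the
+        neighbours are [16, 4]: block-size fields are doubled/halved per
+        step, while num_stages steps by +/-1 (subject to value_too_large).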
+ """ + assert radius >= 1 + + def update(cur_val, inc=True): + if name == "num_stages": + if inc: + return cur_val + 1 + else: + return cur_val - 1 + else: + if inc: + return cur_val * 2 + else: + return cur_val // 2 + + out = [] + # increment loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, True) + if self.value_too_large(name, cur_val): + break + out.append(cur_val) + + # decrement loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, False) + if cur_val <= 0: + break + out.append(cur_val) + + if include_self: + out.append(orig_val) + return out + + @staticmethod + def has_improvement(baseline, test): + threshold = 0.001 # 0.1% + return test is not None and test < baseline * (1 - threshold) + + def check_all_tuning_directions( + self, + func: Callable[["triton.Config"], float], + best_config, + best_timing, + ): + """ + Check all directions. We only do this once the regular coordinate + descent tuning find no better choices any more. + We only have a few tunable fields, so this should be fine. + """ + candidate_values_list = [] + effective_fields = [] + for field in self.tunable_fields: + old_value = get_field(best_config, field) + if old_value is None: + continue + candidate_values = self.get_neighbour_values( + field, + old_value, + radius=self.inductor_meta.get("coordinate_descent_search_radius", 1), + include_self=True, + ) + candidate_values_list.append(candidate_values) + effective_fields.append(field) + + choices = itertools.product(*candidate_values_list) + improved = False + for choice in choices: + assert len(choice) == len(effective_fields) + candidate_config = copy.deepcopy(best_config) + for new_val, field in zip(choice, effective_fields): + set_field(candidate_config, field, new_val) + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config = candidate_config + best_timing = candidate_timing + + return improved, best_config, best_timing + + def compare_config(self, func, candidate_config, best_config, best_timing): + """ + Check if candidate_config is better than best_config. + + Return a touple of (compare_result, candidate_timing). + compare_result is true iff candidate_config is better. + """ + log.debug("Try config %s", candidate_config) + try: + candidate_timing = self.call_func(func, candidate_config) + except Exception as e: + log.debug("Got exception %s", e) + return False, float("inf") + + if self.has_improvement(best_timing, candidate_timing): + log.debug( + "Tune from %s %f -> %s %f", + best_config, + best_timing, + candidate_config, + candidate_timing, + ) + + return True, candidate_timing + return False, candidate_timing + + def autotune( + self, + func: Callable[["triton.Config"], float], + baseline_config: "triton.Config", + baseline_timing: Optional[float] = None, + ) -> "triton.Config": + if baseline_timing is None: + baseline_timing = self.call_func(func, baseline_config) + + log.debug("= Do coordinate descent tuning for %s =", self.name) + log.debug( + "Baseline Config %s, baseline timing %f", baseline_config, baseline_timing + ) + improved = True + best_config = baseline_config + best_timing = baseline_timing + tunable_fields = self.tunable_fields + + while improved: + improved = False + + for name in tunable_fields: + cur_val = get_field(best_config, name) + # some kernel don't have RBLOCK/YBLOCK/ZBLOCK. 
So cur_val may be None + if cur_val is None: + continue + + # It's possible that candidate_values is empty. + # E.g., if XBLOCK is 1 initially and size_hint for x is also 1. + # We would not try either larger or smaller XBLOCK in this case. + candidate_values = self.get_neighbour_values(name, cur_val) + + for next_val in candidate_values: + candidate_config = copy.deepcopy(best_config) + set_field(candidate_config, name, next_val) + + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config, best_timing = candidate_config, candidate_timing + + if not improved and self.inductor_meta.get( + "coordinate_descent_check_all_directions" + ): + old_best_timing = best_timing + improved, best_config, best_timing = self.check_all_tuning_directions( + func, best_config, best_timing + ) + + if improved: + msg = red_text( + "Coordinate descend tuning found improvement of %.3fx by looking in all directions." + ) + log.debug( + msg, + old_best_timing / best_timing, + ) + + log.debug( + "Improve from %s %f -> %s %f, %.3fx", + baseline_config, + baseline_timing, + best_config, + best_timing, + baseline_timing / best_timing, + ) + + return best_config diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/halide_helpers.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/halide_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..db813b0c051a19706937ff05e838aea4f45619f3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/halide_helpers.py @@ -0,0 +1,118 @@ +# mypy: allow-untyped-defs +try: + import halide as hl # type: ignore[import-untyped, import-not-found] +except ImportError: + hl = None + +PHILOX_N_ROUNDS_DEFAULT = 10 # Default number of rounds for philox + +if hl is not None: + PHILOX_KEY_A_U32 = hl.u32(0x9E3779B9) + PHILOX_KEY_B_U32 = hl.u32(0xBB67AE85) + PHILOX_ROUND_A_U32 = hl.u32(0xD2511F53) + PHILOX_ROUND_B_U32 = hl.u32(0xCD9E8D57) +else: + PHILOX_KEY_A_U32 = None + PHILOX_KEY_B_U32 = None + PHILOX_ROUND_A_U32 = None + PHILOX_ROUND_B_U32 = None + + +def _pair_uniform_to_normal(u1, u2): + """Box-Muller transform""" + u1 = hl.max(hl.f32(1.0e-7), u1) + th = hl.f32(6.283185307179586) * u2 + r = hl.sqrt(hl.f32(-2.0) * hl.log(u1)) + return r * hl.cos(th), r * hl.sin(th) + + +def _uint_to_uniform_float(x): + """ + Numerically stable function to convert a random uint into a random float uniformly sampled in [0, 1). + """ + + # TODO: + # conditions can be simplified + # scale is ((2**23 - 1) / 2**23) * 2**(N_BITS - 1) + # https://github.com/triton-lang/triton/blob/e4a0d93ff1a367c7d4eeebbcd7079ed267e6b06f/python/triton/language/random.py#L116-L132. 
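+    # In short: fold negative int32 values onto the non-negative range,
+    # then scale by ~2**-31 so the result lands in [0, 1).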
+ assert x.type() == hl.UInt(32) or x.type() == hl.Int(32) + x = hl.cast(hl.Int(32), x) + scale = hl.f64(4.6566127342e-10) + x = hl.select(x < 0, -x - 1, x) + return x * scale + + +def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds): + def umulhi(a, b): + a = hl.cast(hl.UInt(64), a) + b = hl.cast(hl.UInt(64), b) + return hl.cast(hl.UInt(32), ((a * b) >> 32) & hl.u64(0xFFFFFFFF)) + + for _ in range(n_rounds): + _c0, _c2 = c0, c2 + + c0 = umulhi(PHILOX_ROUND_B_U32, _c2) ^ c1 ^ k0 + c2 = umulhi(PHILOX_ROUND_A_U32, _c0) ^ c3 ^ k1 + c1 = PHILOX_ROUND_B_U32 * _c2 + c3 = PHILOX_ROUND_A_U32 * _c0 + # raise key + k0 = k0 + PHILOX_KEY_A_U32 + k1 = k1 + PHILOX_KEY_B_U32 + + return c0, c1, c2, c3 + + +def halide_philox(seed, c0, c1, c2, c3, n_rounds): + seed = hl.cast(hl.UInt(64), seed) + + assert c0.type().bits() == 32 + + seed_hi = hl.cast(hl.UInt(32), (seed >> 32) & hl.u64(0xFFFFFFFF)) + seed_lo = hl.cast(hl.UInt(32), seed & hl.u64(0xFFFFFFFF)) + + return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds) + + +def randint4x(seed, offset, n_rounds): + offset = hl.cast(hl.UInt(32), offset) + _0 = hl.u32(0) + return halide_philox(seed, offset, _0, _0, _0, n_rounds) + + +def rand4x(seed, offset, n_rounds=PHILOX_N_ROUNDS_DEFAULT): + i1, i2, i3, i4 = randint4x(seed, offset, n_rounds) + u1 = _uint_to_uniform_float(i1) + u2 = _uint_to_uniform_float(i2) + u3 = _uint_to_uniform_float(i3) + u4 = _uint_to_uniform_float(i4) + return u1, u2, u3, u4 + + +def randint(seed, offset, n_rounds=PHILOX_N_ROUNDS_DEFAULT): + ret, _, _, _ = randint4x(seed, offset, n_rounds) + return ret + + +def rand(seed, offset, n_rounds=PHILOX_N_ROUNDS_DEFAULT): + source = randint(seed, offset, n_rounds) + return _uint_to_uniform_float(source) + + +def randn(seed, offset): + i1, i2, _, _ = randint4x(seed, offset, PHILOX_N_ROUNDS_DEFAULT) + u1 = _uint_to_uniform_float(i1) + u2 = _uint_to_uniform_float(i2) + n1, _ = _pair_uniform_to_normal(u1, u2) + return n1 + + +def randint64(seed, offset, low, high): + r0, r1, r2, r3 = randint4x(seed, offset, PHILOX_N_ROUNDS_DEFAULT) + r0 = hl.cast(hl.UInt(64), r0) + r1 = hl.cast(hl.UInt(64), r1) + + result = r0 | (r1 << 32) + size = high - low + result = result % hl.cast(hl.UInt(64), size) + result = hl.cast(hl.Int(64), result) + low + return result diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/hints.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/hints.py new file mode 100644 index 0000000000000000000000000000000000000000..90f9e5a1cc5966bf0416c947cf63ea29a281113d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/hints.py @@ -0,0 +1,179 @@ +# mypy: allow-untyped-defs +import collections +import typing +from dataclasses import fields +from enum import auto, Enum +from typing import Dict, List, Optional, Union + + +# NOTE: if these fail asserts submit a PR to increase them +TRITON_MAX_BLOCK = { + "X": 2048, + "Y": 1024, + "Z": 1024, + "R": 4096 * 16, # * 16 is multi-kernel only +} + + +class ReductionHint(Enum): + INNER = 0 + OUTER = 1 + OUTER_TINY = 2 + DEFAULT = 3 + + +class TileHint(Enum): + SQUARE = 0 + DEFAULT = 1 + + +# Attempt to import AttrsDescriptor from Triton +try: + from triton.compiler.compiler import AttrsDescriptor + + attrs_descriptor_available = True + # Determine if 'ids_of_folded_args' is a valid field for AttrsDescriptor + attr_desc_fields = {f.name for f in fields(AttrsDescriptor)} + ids_of_folded_args_available = "ids_of_folded_args" in attr_desc_fields + divisible_by_8_available = "divisible_by_8" in 
attr_desc_fields
+except ImportError:
+    attrs_descriptor_available = False
+
+# Define `instance_descriptor` function with clear conditional handling
+if attrs_descriptor_available:
+
+    def instance_descriptor(
+        divisible_by_16=None,
+        equal_to_1=None,
+        ids_of_folded_args=None,
+        divisible_by_8=None,
+    ):
+        # Prepare the arguments for AttrsDescriptor
+        kwargs = {
+            "divisible_by_16": divisible_by_16,
+            "equal_to_1": equal_to_1,
+        }
+
+        # Conditionally add 'ids_of_folded_args' if it's available in AttrsDescriptor
+        if ids_of_folded_args_available:
+            kwargs["ids_of_folded_args"] = ids_of_folded_args
+        if divisible_by_8_available:
+            kwargs["divisible_by_8"] = divisible_by_8
+
+        # Instantiate AttrsDescriptor with the prepared arguments
+        return AttrsDescriptor(**kwargs)
+
+else:
+    # Define a namedtuple as a fallback when AttrsDescriptor is not available
+    instance_descriptor = collections.namedtuple(  # type: ignore[no-redef]
+        "instance_descriptor",
+        ["divisible_by_16", "equal_to_1", "ids_of_folded_args", "divisible_by_8"],
+        defaults=[(), (), (), ()],
+    )
+
+
+_NUM_THREADS_PER_WARP = 32
+
+
+class HeuristicType(Enum):
+    PERSISTENT_REDUCTION = auto()
+    POINTWISE = auto()
+    REDUCTION = auto()
+    SPLIT_SCAN = auto()
+    TEMPLATE = auto()
+    USER_AUTOTUNE = auto()
+
+
+class AutotuneHint(Enum):
+    ELEMENTS_PER_WARP_32 = 0
+
+    # Triton codegen tries to codegen a set of AutotuneHints.
+    # Enum.__repr__ looks like "<AutotuneHint.ELEMENTS_PER_WARP_32: 0>",
+    # which isn't valid python.
+    # Enum.__str__ will just return "AutotuneHint.ELEMENTS_PER_WARP_32".
+    __repr__ = Enum.__str__
+
+
+class DeviceProperties(typing.NamedTuple):
+    """Copy device properties into a data structure not requiring torch to be imported"""
+
+    type: str  # type: ignore[assignment]
+    index: int  # type: ignore[assignment]
+    cc: int
+    major: Optional[int] = None
+    regs_per_multiprocessor: Optional[int] = None
+    max_threads_per_multi_processor: Optional[int] = None
+    multi_processor_count: Optional[int] = None
+
+    @classmethod
+    def create(cls, device):
+        import torch
+        from torch._dynamo.device_interface import get_interface_for_device
+
+        device_type = device.type if torch.version.hip is None else "hip"
+        device_interface = get_interface_for_device(device)
+        if device_type == "cuda":
+            props = device_interface.get_device_properties(device)
+            return cls(
+                type=device_type,
+                index=device.index,
+                cc=device_interface.get_compute_capability(device),
+                major=props.major,
+                regs_per_multiprocessor=props.regs_per_multiprocessor,
+                max_threads_per_multi_processor=props.max_threads_per_multi_processor,
+                multi_processor_count=props.multi_processor_count,
+            )
+        return cls(
+            type=device_type,
+            index=device.index,
+            cc=device_interface.get_compute_capability(device),
+        )
+
+
+class HalideInputSpec(typing.NamedTuple):
+    ctype: str
+    name: str
+    shape: Optional[List[str]] = None
+    stride: Optional[List[str]] = None
+    offset: Optional[str] = None
+    alias_of: Optional[str] = None
+
+    def bindings_type(self):
+        if self.ctype in ("half*", "bfloat16*"):
+            return "uint16_t*"  # half not defined
+        return self.ctype
+
+    def halide_type(self):
+        if self.ctype == "half*":
+            return "halide_type_t(halide_type_float, 16)"  # half not defined
+        if self.ctype == "bfloat16*":
+            return "halide_type_t(halide_type_bfloat, 16)"  # half not defined
+        return f"halide_type_of<{self.ctype.replace('*', '')}>()"
+
+    def is_scalar(self):
+        return self.shape is None
+
+    def is_buffer(self):
+        return self.shape is not None
+
+
+class HalideMeta(typing.NamedTuple):
+    argtypes: List[HalideInputSpec]
+    target: str
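+    # target is the Halide target string, passed to the generator as "target=..." (see args() below)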
+    scheduler: Optional[str] = None
+    scheduler_flags: Optional[Dict[str, Union[int, str]]] = None
+    cuda_device: Optional[int] = None
+
+    def args(self):
+        """Command line args to pass to halide generator"""
+        args = [f"target={self.target}"]
+        if self.scheduler:
+            args.append(f"autoscheduler={self.scheduler}")
+        if self.scheduler_flags:
+            assert self.scheduler
+            for k, v in self.scheduler_flags.items():
+                args.append(f"autoscheduler.{k}={v}")
+        return args
+
+    def is_cuda(self):
+        return self.cuda_device is not None
diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/runtime_utils.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/runtime_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..446dbc71c61d1d7f5f7892b1a0c748974d539caf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/runtime_utils.py
@@ -0,0 +1,154 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+import contextlib
+import functools
+import getpass
+import operator
+import os
+import re
+import tempfile
+
+import torch
+
+
+def conditional_product(*args):
+    return functools.reduce(operator.mul, [x for x in args if x])
+
+
+def ceildiv(numer: int, denom: int) -> int:
+    return -(numer // -denom)
+
+
+def is_power_of_2(n: int) -> bool:
+    """Returns whether n = 2 ** m for some integer m."""
+    return n > 0 and n & n - 1 == 0
+
+
+def next_power_of_2(n: int) -> int:
+    """Return the smallest power of 2 greater than or equal to n"""
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n |= n >> 32
+    n += 1
+    return n
+
+
+def get_num_bytes(*args: torch.Tensor, num_in_out_args: int = 0) -> int:
+    """
+    Return the total number of bytes the arguments of tensor type take.
+
+    For in/out args, tensor sizes are counted twice: once for reading and
+    once for writing.
+
+    The first num_in_out_args arguments are in/out tensors.
+    """
+    return sum(
+        arg.numel() * arg.element_size() * (1 + int(i < num_in_out_args))
+        for i, arg in enumerate(args)
+        if isinstance(arg, torch.Tensor)
+    )
+
+
+def triton_config_to_hashable(cfg):
+    """
+    Convert triton config to a tuple that can uniquely identify it. We can use
+    the return value as a dictionary key.
+    """
+    items = sorted(cfg.kwargs.items())
+    items.append(("num_warps", cfg.num_warps))
+    items.append(("num_stages", cfg.num_stages))
+    return tuple(items)
+
+
+def validate_triton_config(cfg):
+    # [Note: Triton pre_hook in inductor]
+    # pre-hook is a lambda function, which we don't attempt to serialize.
+    # right now, if a pre-hook is attached to the config, it will not be saved;
+    # and then it won't be used when the config is loaded from cache.
+    # So we assert: if we do get a pre_hook, it might get ignored after caching.
+    assert (
+        getattr(cfg, "pre_hook", None) is None
+    ), "triton configs with pre_hooks not supported"
+
+
+def create_bandwidth_info_str(ms, num_gb, gb_per_s, prefix="", suffix="", color=True):
+    info_str = f"{prefix}{ms:.3f}ms \t{num_gb:.3f} GB \t {gb_per_s:7.2f}GB/s{suffix}"
+    slow = ms > 0.012 and gb_per_s < 650
+    return red_text(info_str) if color and slow else info_str
+
+
+def get_max_y_grid():
+    return 65535
+
+
+def cache_dir() -> str:
+    cache_dir = os.environ.get("TORCHINDUCTOR_CACHE_DIR")
+    if cache_dir is None:
+        os.environ["TORCHINDUCTOR_CACHE_DIR"] = cache_dir = default_cache_dir()
+    os.makedirs(cache_dir, exist_ok=True)
+    return cache_dir
+
+
+def default_cache_dir():
+    sanitized_username = re.sub(r'[\\/:*?"<>|]', "_", getpass.getuser())
+    return os.path.join(
+        tempfile.gettempdir(),
+        "torchinductor_" + sanitized_username,
+    )
+
+
+try:
+    import colorama
+
+    HAS_COLORAMA = True
+except ModuleNotFoundError:
+    HAS_COLORAMA = False
+    colorama = None  # type: ignore[assignment]
+
+
+def _color_text(msg, color):
+    if not HAS_COLORAMA:
+        return msg
+
+    return getattr(colorama.Fore, color.upper()) + msg + colorama.Fore.RESET
+
+
+def green_text(msg):
+    return _color_text(msg, "green")
+
+
+def yellow_text(msg):
+    return _color_text(msg, "yellow")
+
+
+def red_text(msg):
+    return _color_text(msg, "red")
+
+
+def blue_text(msg):
+    return _color_text(msg, "blue")
+
+
+def get_first_attr(obj, *attrs):
+    """
+    Return the first available attribute or throw an exception if none is present.
+    """
+    for attr in attrs:
+        if hasattr(obj, attr):
+            return getattr(obj, attr)
+
+    raise AssertionError(f"{obj} does not have any of the attributes: {attrs}")
+
+
+try:
+    dynamo_timed = torch._dynamo.utils.dynamo_timed  # type: ignore[has-type]
+except AttributeError:  # Compile workers only have a mock version of torch
+
+    @contextlib.contextmanager
+    def dynamo_timed(key, phase_name=None, fwd_only=True):
+        yield
diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a8ff2ba2408d7fce523e8b46e4adf5d65c827ed
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py
@@ -0,0 +1,542 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import triton
+import triton.language as tl
+
+
+# In the latest triton, math functions were shuffled around into different modules:
+# https://github.com/openai/triton/pull/3172
+try:
+    from triton.language.extra import libdevice
+
+    libdevice = tl.extra.libdevice  # noqa: F811
+    math = tl.math
+except ImportError:
+    if hasattr(tl.extra, "cuda") and hasattr(tl.extra.cuda, "libdevice"):
+        libdevice = tl.extra.cuda.libdevice
+        math = tl.math
+    elif hasattr(tl.extra, "intel") and hasattr(tl.extra.intel, "libdevice"):
+        libdevice = tl.extra.intel.libdevice
+        math = tl.math
+    else:
+        libdevice = tl.math
+        math = tl
+
+
+try:
+    from triton.language.standard import _log2
+except ImportError:
+
+    def _log2(x):
+        raise NotImplementedError
+
+
+@triton.jit
+def promote_to_tensor(x):
+    # Addition promotes to tensor for us
+    return x + tl.zeros((1,), tl.int1)
+
+
+@triton.jit
+def div_floor_integer(a, b):
+    # NOTE: a // b is C division, but we want floor division
+    # Based on c10::div_floor_integer
+    quot = a // b
+    remainder = a % b
+    fixed = tl.where(remainder != 0, quot - 1, quot)
+    return tl.where((a < 0) != (b < 0), fixed, quot)
+
+
+@triton.jit
+def
remainder_integer(a, b): + # NOTE: a % b matches C division, not floor division + remainder = a % b + return tl.where(remainder != 0 and ((a < 0) != (b < 0)), remainder + b, remainder) + + +@triton.jit +def is_floating(x): + return promote_to_tensor(x).dtype.is_floating() + + +@triton.jit +def _prod_accumulate(a, b): + return a * b + + +@triton.jit +def prod(input, axis): + return tl.reduce(input, axis, _prod_accumulate) + + +@triton.jit +def minimum(a, b): + mask = a < b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def maximum(a, b): + mask = a > b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def min2(a, dim): + return tl.reduce(a, dim, minimum) + + +@triton.jit +def max2(a, dim): + return tl.reduce(a, dim, maximum) + + +@triton.jit +def minimum_with_index(a_value, a_index, b_value, b_index): + mask = a_value < b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def maximum_with_index(a_value, a_index, b_value, b_index): + mask = a_value > b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def min_with_index(value, index, dim): + return tl.reduce((value, index), dim, minimum_with_index) + + +@triton.jit +def max_with_index(value, index, dim): + return tl.reduce((value, index), dim, maximum_with_index) + + +@triton.jit +def welford_reduce(value, mean, m2, weight, first_iteration): + if first_iteration: + new_weight = tl.full(weight.shape, 1, weight.dtype) + new_mean = value + new_m2 = tl.zeros_like(m2) + else: + delta = value - mean + new_weight = weight + 1 + new_mean = mean + delta / new_weight + new_m2 = m2 + delta * (value - new_mean) + return new_mean, new_m2, new_weight + + +@triton.jit +def welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2): + delta = mean_2 - mean_1 + new_weight = weight_1 + weight_2 + w2_over_w = tl.where(new_weight == 0.0, 0.0, weight_2 / new_weight) + return ( + mean_1 + delta * w2_over_w, + m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w, + new_weight, + ) + + +@triton.jit +def welford(mean, m2, weight, dim): + return tl.reduce((mean, m2, weight), dim, welford_combine) + + +@triton.jit +def device_assert_then(cond, msg, r): + tl.device_assert(cond, msg) + return r + + +@triton.jit +def randint64(seed, offset, low, high): + r0, r1, r2, r3 = tl.randint4x(seed, offset) + r0 = r0.to(tl.uint64) + r1 = r1.to(tl.uint64) + result = r0 | (r1 << 32) + size = high - low + result = result % size.to(tl.uint64) + result = result.to(tl.int64) + low + return result + + +@triton.jit +def _any_combine(a, b): + return a | b + + +@triton.jit +def any(a, dim): + return tl.reduce(a, dim, _any_combine) + + +@triton.jit +def bucketize_binary_search( + values, # 1D tensor + offsets_ptr, + indexing_dtype, + right, # bool: if true, use intervals closed on the left; see [Note: Inductor bucketize op] + OFFSETS_SIZE: 
int, + BLOCK_SHAPE, # tuple/list of block shape +): + """ + See [Note: Inductor bucketize op] + """ + + low = tl.zeros(BLOCK_SHAPE, dtype=indexing_dtype) + high = tl.full(BLOCK_SHAPE, OFFSETS_SIZE, dtype=indexing_dtype) + + full_range = OFFSETS_SIZE + 1 + while full_range > 1: + mid = (high + low) // 2 + mask = mid < OFFSETS_SIZE + bucket_upper_bound = tl.load(offsets_ptr + mid, mask=mask, other=0.0) + if right: + is_above = values >= bucket_upper_bound + else: + is_above = values > bucket_upper_bound + + low = tl.where(is_above & mask, mid + 1, low) + high = tl.where(is_above, high, mid) + + full_range = (full_range + 1) // 2 + + return low + + +@triton.jit +def pack_value_flag( + value, + flag, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + uv = value.to(DTYPE_VALUE_AS_UINT, bitcast=True).to(DTYPE_PACK) + return flag.to(DTYPE_PACK) | (uv << bitwidth) + + +@triton.jit +def unpack_value( + pack, + DTYPE_VALUE, + DTYPE_VALUE_AS_UINT, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE = tl.core._constexpr_to_value(DTYPE_VALUE) + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + value_uint = (pack >> bitwidth).to(DTYPE_VALUE_AS_UINT) + return value_uint.to(DTYPE_VALUE, bitcast=True) + + +@triton.jit +def unpack_flag(pack, DTYPE_FLAG): + return pack.to(DTYPE_FLAG) + + +@triton.jit +def exclusive_scan_decoupled_lookback( + scratch_base, + block_value, + index, + combine_fn, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value`` + DTYPE_PACK: Unsigned type twice the width of block_value + + NOTE: This function is limited to values which are 32-bits or less because + we need to pack (value, flag) into a single unsigned int. 
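+    For wider values, use exclusive_scan_decoupled_lookback_64 below, which
+    keeps the flag and the 64-bit partial sums in separate scratch slots
+    instead of packing them into one word.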
+ """ + # Publish block sum so subsequent blocks don't get stuck waiting for us + DTYPE_VALUE = block_value.dtype + pack = pack_value_flag( + block_value, + tl.full(block_value.shape, 1, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + if index > 0: + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + + # Calculate exclusive prefix scan + exclusive_prefix = tl.zeros([], DTYPE_VALUE) + prefix_valid = False + test_target = index - 1 + while test_target >= 0: + # tl.atomic_load + flag = tl.full([], 0, DTYPE_VALUE_AS_UINT) + while flag == 0: + pack = tl.atomic_add(scratch_base + test_target, 0, sem="relaxed") + flag = unpack_flag(pack, DTYPE_VALUE_AS_UINT) + + value = unpack_value(pack, DTYPE_VALUE, DTYPE_VALUE_AS_UINT) + if prefix_valid: + exclusive_prefix = combine_fn(value, exclusive_prefix) + else: + exclusive_prefix = value + prefix_valid = True + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + if prefix_valid: + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + else: + inclusive_prefix = block_value + pack = pack_value_flag( + inclusive_prefix, + tl.full([], 2, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + return exclusive_prefix + + +@triton.jit +def exclusive_scan_decoupled_lookback_64(scratch_base, block_value, index, combine_fn): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block, must be 64-bits wide + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + init: Scalar value equal to the identiy of combine_fn + """ + # Publish block sum so subsequent blocks don't get stuck waiting for us + if index > 0: + block_value_u64 = block_value.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 1, block_value_u64) + tl.debug_barrier() + flag_one = tl.full([], 1, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_one, sem="release") + + # Calculate exclusive prefix scan + exclusive_prefix = tl.zeros([], block_value.dtype) + prefix_valid = False + test_target = index - 1 + while test_target >= 0: + flag = tl.full([], 0, tl.uint64) + while flag == 0: + flag = tl.atomic_add(scratch_base + 3 * test_target + 0, 0, sem="acquire") + + value_u64 = tl.load(scratch_base + 3 * test_target + flag.to(tl.int32)) + value = value_u64.to(block_value.dtype, bitcast=True) + if prefix_valid: + exclusive_prefix = combine_fn(value, exclusive_prefix) + else: + exclusive_prefix = value + prefix_valid = True + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + if prefix_valid: + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + else: + inclusive_prefix = block_value + inclusive_prefix_u64 = inclusive_prefix.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 2, inclusive_prefix_u64) + tl.debug_barrier() + flag_two = tl.full([], 2, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_two, sem="release") + + return exclusive_prefix + + +@triton.jit +def frexp(x): + # TODO(isuruf): use inline_asm_elementwise here + y = libdevice.ilogb(x) + 1 + exponent = tl.where(x == 0, 0, y) + 
mantissa = tl.where(x == 0, 0, libdevice.ldexp(x, -y)) + return mantissa, exponent + + +@triton.jit +def _compare_and_swap_with_index( + x, + idxs, + rnumel, + flip, + i: tl.constexpr, + n_dims: tl.constexpr, + stable: tl.constexpr, + descending: tl.constexpr, +): + n_outer: tl.constexpr = x.numel >> n_dims + shape: tl.constexpr = [n_outer * 2**i, 2, 2 ** (n_dims - i - 1)] + + idtype = tl.core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True) + + y = tl.reshape(x, shape) + iy = y.to(idtype, bitcast=True) + # slice left/right with 'stride' 2**(n_dims - i - 1) + right_mask = tl.arange(0, 2)[None, :, None].to(idtype) + left_mask = (1 - right_mask).to(idtype) + ileft = tl.broadcast_to(tl.sum(iy * left_mask, 1)[:, None, :], shape) + iright = tl.broadcast_to(tl.sum(iy * right_mask, 1)[:, None, :], shape) + ileft = tl.reshape(ileft, x.shape) + iright = tl.reshape(iright, x.shape) + left = ileft.to(x.dtype, bitcast=True) + right = iright.to(x.dtype, bitcast=True) + + # idx + y_idx = tl.reshape(idxs, shape) + left_idx = tl.broadcast_to( + tl.sum(y_idx * left_mask.to(y_idx.dtype), 1)[:, None, :], shape + ) + right_idx = tl.broadcast_to( + tl.sum(y_idx * right_mask.to(y_idx.dtype), 1)[:, None, :], shape + ) + left_idx = tl.reshape(left_idx, x.shape) + right_idx = tl.reshape(right_idx, x.shape) + + # valid + if rnumel is None: + left_valid_mask = tl.full(x.shape, True, tl.int1) + right_valid_mask = tl.full(x.shape, True, tl.int1) + else: + left_valid_mask = left_idx < rnumel + right_valid_mask = right_idx < rnumel + + # actual compare-and-swap + ix = x.to(idtype, bitcast=True) + + if descending: + cond = left < right + else: + cond = left > right + + if stable: + # When stable sorting, tie break by index + cond = cond | ((left == right) & (left_idx > right_idx)) + + cond = (right_valid_mask > left_valid_mask) | ( + (right_valid_mask == left_valid_mask) & cond + ) + cond = cond ^ flip + ret = ix ^ tl.where(cond, ileft ^ iright, tl.zeros_like(ix)) + new_idxs = idxs ^ tl.where(cond, left_idx ^ right_idx, tl.zeros_like(idxs)) + + return ret.to(x.dtype, bitcast=True), new_idxs + + +@triton.jit +def _bitonic_merge_with_index( + x, + idxs, + rnumel, + stage: tl.constexpr, + alternating: tl.constexpr, + n_dims: tl.constexpr, + stable: tl.constexpr, + descending: tl.constexpr, +): + n_outer: tl.constexpr = x.numel >> n_dims + tl.static_assert(stage <= n_dims) + # flip denotes whether to re-arrange sub-sequences of elements in ascending or + # descending order. + # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage + # if flip = 00110011... 
then all the elements will be re-arranged alternatingly (with + # a stride of 2) at this stage + if alternating: + shape: tl.constexpr = [n_outer * 2 ** (n_dims - 1 - stage), 2, 2**stage] + flip = tl.reshape( + tl.broadcast_to(tl.arange(0, 2)[None, :, None], shape), x.shape + ) + else: + flip = False + # perform `stage` rounds of `compare-and-swap` + for i in tl.static_range(stage): + x, idxs = _compare_and_swap_with_index( + x, idxs, rnumel, flip, i + (n_dims - stage), n_dims, stable, descending + ) + return x, idxs + + +@triton.jit +def sort_with_index( + x, # value + idxs, # index + rnumel, # number of elements + dim: tl.constexpr = None, + stable: tl.constexpr = tl.constexpr(False), + descending: tl.constexpr = tl.constexpr(False), +): + x, idxs = tl.broadcast(x, idxs) + # handle default dimension or check that it is the most minor dim + _dim: tl.constexpr = len(x.shape) - 1 if dim is None else dim + tl.static_assert( + _dim == len(x.shape) - 1, "only minor dimension is currently supported" + ) + # iteratively run bitonic merge-sort steps + n_dims: tl.constexpr = _log2(x.shape[_dim]) + + for i in tl.static_range(1, n_dims + 1): + x, idxs = _bitonic_merge_with_index( + x, + idxs, + rnumel, + i, + alternating=i < n_dims, + n_dims=n_dims, + stable=stable, + descending=descending, + ) + return x, idxs + + +@triton.jit +def select_one(x, mask, dim, keep_dims=False): + idtype = tl.core.get_int_dtype(x.dtype.primitive_bitwidth, signed=False) + ix = x.to(idtype, bitcast=True) + iy = tl.sum(ix * mask, dim, keep_dims=keep_dims) + return iy.to(x.dtype, bitcast=True) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc2f9afc8d61f61f81056c64f1b85688c0adaf3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py @@ -0,0 +1,1807 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import builtins +import copy +import functools +import hashlib +import inspect +import logging +import math +import operator +import os +import os.path +import re +import sys +import threading +import time +from typing import Any, Dict, List, Optional, Set, Tuple + +import torch + +from .autotune_cache import AutotuneCache +from .benchmarking import benchmarker +from .coordinate_descent_tuner import CoordescTuner +from .hints import ( + _NUM_THREADS_PER_WARP, + AutotuneHint, + DeviceProperties, + HeuristicType, + ReductionHint, + TileHint, + TRITON_MAX_BLOCK, +) +from .runtime_utils import ( + cache_dir, + ceildiv, + conditional_product, + create_bandwidth_info_str, + dynamo_timed, + get_first_attr, + get_max_y_grid, + get_num_bytes, + next_power_of_2, + triton_config_to_hashable, + validate_triton_config, +) + + +try: + import triton +except ImportError: + triton = None + +if triton is not None: + from triton import Config + from triton.compiler import CompiledKernel + from triton.runtime.autotuner import OutOfResources + from triton.runtime.jit import KernelInterface + + try: + from triton.compiler.compiler import ASTSource + except ImportError: + ASTSource = None + + try: + from triton.backends.compiler import GPUTarget + except ImportError: + GPUTarget = None +else: + Config = object + KernelInterface = object + OutOfResources = object + ASTSource = None + GPUTarget = None + +try: + autograd_profiler = torch.autograd.profiler +except AttributeError: # Compile workers 
only have a mock version of torch + + class autograd_profiler: # type: ignore[no-redef] + _is_profiler_enabled = False + + +log = logging.getLogger(__name__) + + +def autotune_hints_to_configs( + hints: Set[AutotuneHint], size_hints, block_size: int +) -> List[Config]: + """ + AutotuneHints can be attached to the metadata of triton kernels for providing + suggestions about what to try for autotuning. One reason to do this is if there are + some configs that are only useful in specific scenarios, in which case we can avoid + wasting compile time on autotuning unless we know we are in one of those scenarios. + + Based on those hints, this function will generate a list of additional autotuning + configs to try. + """ + xyz_options: Tuple[Tuple[int, Optional[int], Optional[int]], ...] + configs = [] + + for hint in hints: + if hint == AutotuneHint.ELEMENTS_PER_WARP_32: + if len(size_hints) == 1: + xyz_options = ((block_size // 4, None, None),) + elif len(size_hints) == 2: + xyz_options = ((block_size // 4, 1, None), (1, block_size // 4, None)) + elif len(size_hints) == 3: + xyz_options = ( + (block_size // 4, 1, 1), + (1, block_size // 4, 1), + (1, 1, block_size // 4), + ) + for xyz in xyz_options: + configs.append( + triton_config( + size_hints, + *xyz, + num_elements_per_warp=32, + ) + ) + + return configs + + +def disable_pointwise_autotuning(inductor_meta): + # Autotuning can give different benchmarking results from run to run, and + # therefore we disable autotuning when the use_deterministic flag is on. + if inductor_meta.get("are_deterministic_algorithms_enabled"): + return True + return not inductor_meta.get("autotune_pointwise", True) + + +def _dump_launch_params(args, kwargs, launcher, kernel_name): + call_args = [] + call_kwargs = {} + for arg in args: + if isinstance(arg, (int, bool)): + call_args.append(str(arg)) + else: + call_args.append("T") + for k, v in kwargs.items(): + # mark non-scalar kwarg values with a "T" placeholder, mirroring the args loop + if isinstance(v, (int, bool)): + call_kwargs[k] = v + else: + call_kwargs[k] = "T" + for k, v in launcher.config.kwargs.items(): + call_kwargs[k] = v + call_kwargs["num_warps"] = launcher.config.num_warps + call_kwargs["num_stages"] = launcher.config.num_stages + args_str = "" + args_str += ", ".join(call_args) + for k, v in call_kwargs.items(): + args_str += f", {k}={v}" + + abs_path = os.path.abspath(sys.argv[0]) + with open(f"{abs_path}.launch_params", "a") as f: + f.write(f"{kernel_name} | {args_str}\n") + + +class CachingAutotuner(KernelInterface): + """ + Simplified version of Triton autotuner that has no invalidation + key and caches the best config to disk to improve cold start times. + Unlike the main triton Autotuner, this version can precompile all + configs, and does not rely on the Triton JIT.
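+
+ Rough usage sketch (hypothetical values; inductor normally constructs
+ instances via the ``cached_autotune`` decorator defined later in this file)::
+
+     autotuner = CachingAutotuner(
+         fn,  # a triton.jit-ed kernel
+         triton_meta=triton_meta,
+         configs=candidate_configs,
+         save_cache_hook=None,
+         mutated_arg_names=[],
+         heuristic_type=HeuristicType.POINTWISE,
+     )
+     autotuner.run(*call_args, grid=grid(numel), stream=stream)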
+ """ + + def __init__( + self, + fn, + triton_meta, # passed directly to triton + configs, + save_cache_hook, + mutated_arg_names: List[str], # see [Note: clone mutated buffers] + heuristic_type, + size_hints=None, + inductor_meta=None, # metadata not relevant to triton + custom_kernel=False, # whether the kernel is inductor-generated or custom + filename: Optional[str] = None, + ): + super().__init__() + + assert len(configs) > 0, "Non-empty TritonConfig list required for compiling" + # makes sure there are no pre-hooks on any of the triton configs + for cfg in configs: + validate_triton_config(cfg) + + self.fn = fn + self.device_props: DeviceProperties = triton_meta["device"] + self.triton_meta = { + **triton_meta, + "device": self.device_props.index, + "device_type": self.device_props.type, + } + self.inductor_meta = {} if inductor_meta is None else inductor_meta + self.save_cache_hook = save_cache_hook + self.mutated_arg_names = mutated_arg_names + self.configs = configs + self.heuristic_type = heuristic_type + self.custom_kernel = custom_kernel + self.cuda_kernel_saved = False + if log.isEnabledFor(logging.DEBUG): + log.debug( + "CachingAutotuner gets %d configs for %s", + len(self.configs), + self.fn.__name__, + ) + for c in self.configs: + log.debug(c) + + self.launchers = [] # type: ignore[var-annotated] + self.lock = threading.Lock() + if os.getenv("TRITON_CACHE_DIR") is None: + os.environ["TRITON_CACHE_DIR"] = os.path.join( + cache_dir(), + "triton", + str(self.triton_meta.get("device", 0)), + ) + log.debug("Triton cache dir: %s", os.environ["TRITON_CACHE_DIR"]) + + self.size_hints = size_hints + self.coordesc_tuner = CoordescTuner( + is_mm=False, + name=self.fn.__name__, + size_hints=size_hints, + inductor_meta=self.inductor_meta, + ) + self.filename = filename + + self.precompile_time_taken_ns = 0 + self.autotune_time_taken_ns = 0 + + def precompile(self, warm_cache_only=False): + with self.lock: + if self.launchers: + return + self.launchers = [] + compiled_binaries = [] + if not self.configs: + raise RuntimeError("No triton configs are available") + for c in self.configs: + try: + compiled_binary, launcher = self._precompile_config( + c, warm_cache_only + ) + except OutOfResources as e: + if len(self.configs) == 1: + # There are no valid Triton configs + raise e + # Skip the config if we run out of resource + continue + self.launchers.append(launcher) + compiled_binaries.append(compiled_binary) + + if len(self.launchers) == 0: + raise RuntimeError( + "No valid triton configs. Report a fatal compilation error" + ) + + seen_configs = set(self.configs) + + device_prop = self.device_props + if ( + self.inductor_meta.get("dynamic_scale_rblock", True) + and self.heuristic_type == HeuristicType.REDUCTION + and self.size_hints is not None + # Disable for AMDGPU/Intel as Triton is not ready to return n_regs for a compiled_binary. 
+ and device_prop.type == "cuda" + and device_prop.major + and device_prop.major >= 8 + ): + assert device_prop.regs_per_multiprocessor + assert device_prop.max_threads_per_multi_processor + assert device_prop.multi_processor_count + for triton_config, compiled_binary in zip( + self.configs, compiled_binaries + ): + assert len(self.size_hints) == 2 + xblock = triton_config.kwargs.get("XBLOCK", 1) + rblock = triton_config.kwargs["RBLOCK"] + total_block = (self.size_hints[0] + xblock - 1) // xblock + nreg = getattr(compiled_binary, "n_regs", None) + if nreg is None: + continue + + # make sure rblock is not too small + if rblock <= 64: + continue + + # each SM of A100 has 65536 32-bit registers. To maximize + # the theoretical occupancy, we need to run 2048 threads on each + # SM. So each thread should use no more than 65536 / 2048 + # = 32 registers. In cases where occupancy matters, and each + # thread uses too many registers, reduce RBLOCK to reduce + # the register usage. + # For kernel https://gist.github.com/shunting314/e4cccc031fe30d378b9b23c08c238cbd + # from PLBartForCausalLM, latency improved from + # 7.795ms to 4.883ms. + # + if ( + nreg + <= device_prop.regs_per_multiprocessor + // device_prop.max_threads_per_multi_processor + ): + continue + + nreg_per_warp = nreg * 32 + nreg_per_block = nreg_per_warp * triton_config.num_warps + + # Previously we set max_blocks_per_sm to 'max_threads_per_multi_processor / (32 * num_warps)' + # The formula below is a tighter upper bound since we have the assumption that + # nreg > device_prop.regs_per_multiprocessor // device_prop.max_threads_per_multi_processor + # due to the if condition above and: + # regs_per_multiprocessor / nreg_per_block + # = regs_per_multiprocessor / (nreg * 32 * num_warps) + # < regs_per_multiprocessor / ((regs_per_multiprocessor / max_threads_per_multi_processor) * 32 * num_warps) + # = max_threads_per_multi_processor / (32 * num_warps) + # Using a tighter upper bound can reveal more optimization opportunities.
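+ # Illustrative arithmetic (hypothetical values, added for clarity): with
+ # regs_per_multiprocessor = 65536, nreg = 40 and num_warps = 8,
+ # nreg_per_block = 40 * 32 * 8 = 10240, so the bound below gives
+ # max_blocks_per_sm = 65536 // 10240 = 6, tighter than the
+ # thread-count bound 2048 / (32 * 8) = 8.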
+ max_blocks_per_sm = max( + device_prop.regs_per_multiprocessor // nreg_per_block, 1 + ) + + if ( + total_block + <= max_blocks_per_sm * device_prop.multi_processor_count + ): + # no need to improve occupancy + continue + new_config = copy.deepcopy(triton_config) + new_config.kwargs["RBLOCK"] = rblock // 2 + if new_config in seen_configs: + continue + seen_configs.add(new_config) + log.debug( + "Dynamically scale down RBLOCK from TritonConfig(%s) and get a new TritonConfig(%s)", + triton_config, + new_config, + ) + self.launchers.append( + self._precompile_config(new_config, warm_cache_only)[1] + ) + self.configs = None + + def get_device_interface(self): + # this code cannot run in compile workers, because it imports from torch + from torch._dynamo.device_interface import get_interface_for_device + + return get_interface_for_device(self.device_props.type.replace("hip", "cuda")) + + def _precompile_config(self, cfg: Config, warm_cache_only: bool): + """Ahead of time compile a given autotuner config.""" + compile_meta = copy.deepcopy(self.triton_meta) + for k, v in cfg.kwargs.items(): + if self.device_props.type == "hip": + if k == "matrix_instr_nonkdim": + compile_meta["matrix_instr_nonkdim"] = v + continue + if k == "waves_per_eu": + compile_meta["waves_per_eu"] = v + continue + compile_meta["constants"][self.fn.arg_names.index(k)] = v + compile_meta["num_warps"] = cfg.num_warps + compile_meta["num_stages"] = cfg.num_stages + compile_meta["debug"] = self.inductor_meta.get( + "assert_indirect_indexing", True + ) and not self.inductor_meta.get("is_hip", False) + + # device type will be "hip" rather than "cuda" here + compile_meta["device_type"] = self.device_props.type + compile_meta["cc"] = self.device_props.cc + + if ASTSource: + compile_args = ( + ASTSource( + self.fn, + compile_meta["signature"], + compile_meta["constants"], + compile_meta["configs"][0], + ), + ) + + cc_str = str(compile_meta["cc"]) + if "gfx10" in cc_str or "gfx11" in cc_str: + rocm_warp_size = 32 + else: + rocm_warp_size = 64 + + if GPUTarget: + target = GPUTarget( + compile_meta["device_type"], + compile_meta["cc"], + rocm_warp_size if torch.version.hip else 32, + ) + else: + target = ( + (compile_meta["device_type"], compile_meta["cc"]) + if not torch.version.hip + else [ + compile_meta["device_type"], + compile_meta["cc"], + rocm_warp_size, + ] + ) + + options = { + "num_warps": compile_meta["num_warps"], + "num_stages": compile_meta["num_stages"], + "debug": compile_meta["debug"], + } + if self.device_props.type == "hip": + if "waves_per_eu" in compile_meta: + options["waves_per_eu"] = compile_meta["waves_per_eu"] + if "matrix_instr_nonkdim" in compile_meta: + options["matrix_instr_nonkdim"] = compile_meta[ + "matrix_instr_nonkdim" + ] + compile_kwargs = { + "target": target, + "options": options, + } + else: + compile_args = (self.fn,) + compile_kwargs = compile_meta + + if warm_cache_only: + return ( + triton.compile(*compile_args, **compile_kwargs), + None, + ) + + # importing from torch is safe now that precompile has returned + from torch._dynamo.device_interface import DeviceGuard + + device_interface = self.get_device_interface() + + # load binary to the correct device + with DeviceGuard(device_interface, compile_meta["device"]): # type: ignore[attr-defined] + # need to initialize context + device_interface.synchronize(device_interface.current_device()) + + try: + binary = triton.compile(*compile_args, **compile_kwargs) + except Exception: + log.exception( + "Triton compilation failed: %s\n%s\nmetadata: 
%s", + self.inductor_meta.get("kernel_name", "triton_"), + self.fn.src, + compile_meta, + ) + raise + binary._init_handles() + + call_args = [ + arg + for i, arg in enumerate(self.fn.arg_names) + if i not in self.fn.constexprs + ] + def_args = [name for name in self.fn.arg_names if name not in cfg.kwargs] + + binary_shared = ( + binary.shared if hasattr(binary, "shared") else binary.metadata.shared + ) + + scope = { + "grid_meta": cfg.kwargs, + "bin": binary, + "launch_enter_hook": CompiledKernel.launch_enter_hook, + "launch_exit_hook": CompiledKernel.launch_exit_hook, + "metadata": binary.packed_metadata + if hasattr(binary, "packed_metadata") + else binary.metadata, + "shared": binary_shared, + } + + scope["num_warps"] = ( + binary.num_warps + if hasattr(binary, "num_warps") + else binary.metadata.num_warps + ) + + scope["cta_args"] = ( + (binary.num_ctas, *get_first_attr(binary, "cluster_dims", "clusterDims")) + if hasattr(binary, "num_ctas") + else ( + (binary.metadata.num_ctas, *binary.metadata.cluster_dims) + if hasattr(binary, "metadata") + else () + ) + ) + + scope["function"] = get_first_attr(binary, "function", "cu_function") + + def get_launch_args_without_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args before CompiledKernel.launch_metadata is added. + """ + return ( + grid_0, + grid_1, + grid_2, + num_warps, + *cta_args, + shared, + stream, + function, + launch_enter_hook, + launch_exit_hook, + metadata, + ) + + # Getting the kernel launch args is extremely perf-sensitive. Evaluating + # `bin.launch_metadata` is relatively expensive, and returns None unless a + # `launch_enter_hook` is installed. So if we don't have that hook installed, + # we want to burn None in to the launch args with zero overhead. + # See https://github.com/pytorch/pytorch/issues/123597 + if binary.launch_enter_hook: + + def get_launch_args_with_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args after CompiledKernel.launch_metadata is added + by https://github.com/openai/triton/pull/3492 . + """ + return ( + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin.launch_metadata(grid, stream, *args), + launch_enter_hook, + launch_exit_hook, + ) + + else: + + def get_launch_args_with_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args after CompiledKernel.launch_metadata is added + by https://github.com/openai/triton/pull/3492 . 
+ """ + return ( + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + None, + launch_enter_hook, + launch_exit_hook, + ) + + scope["get_launch_args"] = ( + get_launch_args_with_kernel_launch_metadata + if hasattr(binary, "launch_metadata") + else get_launch_args_without_kernel_launch_metadata + ) + + scope["runner"] = get_first_attr(binary, "run", "c_wrapper") + + exec( + f""" + def launcher({', '.join(def_args)}, grid, stream): + if callable(grid): + grid_0, grid_1, grid_2 = grid(grid_meta) + else: + grid_0, grid_1, grid_2 = grid + + args = {', '.join(call_args)}, + launch_args = get_launch_args( + grid, grid_0, grid_1, grid_2, stream, function, + metadata, bin, launch_enter_hook, launch_exit_hook, + num_warps, shared, cta_args, args + ) + runner(*launch_args, *args) + return bin + """.lstrip(), + scope, + ) + + launcher = scope["launcher"] + launcher.config = cfg + launcher.n_regs = getattr(binary, "n_regs", None) + launcher.n_spills = getattr(binary, "n_spills", None) + launcher.shared = binary_shared + launcher.store_cubin = self.inductor_meta.get("store_cubin", False) + # store this global variable to avoid the high overhead of reading it when calling run + if launcher.store_cubin: + launcher.fn = self.fn + launcher.bin = binary + + return binary, launcher + + def bench(self, launcher, *args, grid, with_profiler=False, **kwargs): + """Measure the performance of a given launcher""" + # we don't skip configs wiht spilled registers when auto-tuning custom + # (user-written) Triton kernels, as (i) we don't have any knowledge or + # control over the kernel code; (ii) there is empirical evidence that + # for some (complicated) custom Triton kernels, a register-spilling + # config may yield the best latency. + if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get( + "spill_threshold", 16 + ): + log.debug( + "Skip config %s because of register spilling: %d", + launcher.config, + launcher.n_spills, + ) + return float("inf") + + device_interface = self.get_device_interface() + stream = device_interface.get_raw_stream(device_interface.current_device()) + + def kernel_call(): + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + grid=grid, + stream=stream, + ) + + if with_profiler: + from torch._inductor.utils import do_bench_using_profiling + + return do_bench_using_profiling(kernel_call, warmup=10, rep=40) + + return benchmarker.benchmark_gpu(kernel_call, rep=40, fast_flush=True) + + def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]: + from ..compile_fx import clone_preserve_strides + + # [Note: clone mutated buffers] + # clone inplace buffers to avoid autotune contaminating them if + # the kernel does in-place stores. 
Avoid cloning other buffers because + # it leads to increased memory use + cloned_args = [] + for i, arg in enumerate(args): + if self.fn.arg_names[i] in self.mutated_arg_names: + assert isinstance(arg, torch.Tensor) + cloned_args.append(clone_preserve_strides(arg)) + else: + cloned_args.append(arg) + + cloned_kwargs: Dict[str, Any] = {} + for name, arg in kwargs.items(): + if name in self.mutated_arg_names: + assert isinstance(arg, torch.Tensor) + cloned_kwargs[name] = clone_preserve_strides(arg) + else: + cloned_kwargs[name] = arg + + return cloned_args, cloned_kwargs + + def benchmark_all_configs(self, *args, **kwargs): + with dynamo_timed("CachingAutotuner.benchmark_all_configs"): + timings = { + launcher: self.bench(launcher, *args, **kwargs) + for launcher in self.launchers + } + + for k, v in timings.items(): + self.coordesc_tuner.cache_benchmark_result(k.config, v) + + if log.isEnabledFor(logging.DEBUG): + log.debug("Benchmark all input configs for %s, get:", self.fn.__name__) + for k, v in timings.items(): + log.debug( + "%s: %f, nreg %d, nspill %d, #shared-mem %s", + k.config, + v, + k.n_regs, + k.n_spills, + k.shared, + ) + + return timings + + def autotune_to_one_config(self, *args, **kwargs): + """Do the actual autotuning""" + start_time = time.time_ns() + timings = self.benchmark_all_configs(*args, **kwargs) + benchmark_time_taken_ns = time.time_ns() - start_time + self.launchers = [builtins.min(timings, key=timings.get)] + self.autotune_time_taken_ns = ( + self.precompile_time_taken_ns + benchmark_time_taken_ns + ) + if self.save_cache_hook: + self.save_cache_hook(self.launchers[0].config, self.autotune_time_taken_ns) + + def save_gpu_kernel(self, grid, stream, launcher): + if callable(grid): + grid_x, grid_y, grid_z = grid(launcher.config.kwargs) + else: + grid_x, grid_y, grid_z = grid + + key = self.inductor_meta.get("kernel_name", None) # unique kernel name + assert key is not None, "kernel_name cannot be None" + params = { + "mangled_name": launcher.bin.metadata.name + if hasattr(launcher.bin.metadata, "name") + else launcher.bin.metadata["name"], + "grid_x": grid_x, + "grid_y": grid_y, + "grid_z": grid_z, + "x_block": launcher.config.kwargs.get("XBLOCK", 1), + "y_block": launcher.config.kwargs.get("YBLOCK", None), + "z_block": launcher.config.kwargs.get("ZBLOCK", None), + "num_warps": launcher.bin.num_warps + if hasattr(launcher.bin, "num_warps") + else launcher.bin.metadata.num_warps, + "shared_mem": launcher.bin.shared + if hasattr(launcher.bin, "shared") + else launcher.bin.metadata.shared, + "stream": stream, + # User defined triton kernels will have arbitrary kwarg names + "meta": launcher.config.kwargs, + } + from torch._inductor.codecache import CudaKernelParamCache + + bin_type = {"hip": "hsaco", "xpu": "spv"}.get(self.device_props.type, "cubin") + binary = launcher.bin.asm[bin_type] + CudaKernelParamCache.set(key, params, binary, bin_type) + + self.cuda_kernel_saved = True + + def coordinate_descent_tuning(self, launcher, *args, **kwargs): + """ + Coordinate descent tuning can be run with or without max-autotune. + + The only difference between these two is the starting config for coordinate_descent tuning. + E.g., assume regular autotune only gets one config C1, while max-autotune gets 4 configs C1, C2, C3, C4 + and figures out that C3 is the best. + + Then, if coordinate descent tuning is run with max-autotune disabled, it will start from C1; + while if coordinate descent tuning is run with max-autotune enabled, it will start from C3.
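+
+ From that starting config, the tuner (see CoordescTuner) perturbs one
+ tunable field at a time (e.g. XBLOCK, RBLOCK, num_warps) and keeps a
+ change only when the benchmarked latency improves.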
+ """ + if ( + self.heuristic_type == HeuristicType.TEMPLATE + or self.heuristic_type == HeuristicType.USER_AUTOTUNE + ): + # skip triton template + return launcher + + config2launcher = {launcher.config: launcher} + + def benchmark_one_config(config): + with self.lock: + _, launcher = self._precompile_config(config, False) + config2launcher[config] = launcher + + out = self.bench(launcher, *args, **kwargs) + log.debug( + "COORDESC: %s: %f, nreg %d, nspill %d, #shared-mem %d", + launcher.config, + out, + launcher.n_regs, + launcher.n_spills, + launcher.shared, + ) + return out + + assert not ( + self.heuristic_type == HeuristicType.PERSISTENT_REDUCTION + and "RBLOCK" in launcher.config.kwargs + ), "Coordinate descent tuner relies on the assumption that persistent reduction's triton config does not have RBLOCK" + start_time = time.time_ns() + best_config = self.coordesc_tuner.autotune( + benchmark_one_config, launcher.config, None + ) + coordesc_time_taken_ns = time.time_ns() - start_time + best_config.found_by_coordesc = True + + if self.save_cache_hook: + self.save_cache_hook( + best_config, + self.autotune_time_taken_ns + coordesc_time_taken_ns, + found_by_coordesc=True, + ) + return config2launcher.get(best_config) + + def run(self, *args, grid, stream, **kwargs): + if len(self.launchers) != 1: + if len(self.launchers) == 0: + start_time = time.time_ns() + self.precompile() + self.precompile_time_taken_ns = time.time_ns() - start_time + if len(self.launchers) > 1: + self.autotune_to_one_config(*args, grid=grid, **kwargs) + + if not getattr( + self.launchers[0].config, "found_by_coordesc", False + ) and self.inductor_meta.get("coordinate_descent_tuning", False): + self.launchers = [ + self.coordinate_descent_tuning( + self.launchers[0], *args, grid=grid, **kwargs + ) + ] + + (launcher,) = self.launchers + if launcher.store_cubin: + self.save_gpu_kernel(grid, stream, launcher) + + if os.environ.get("TORCHINDUCTOR_DUMP_LAUNCH_PARAMS", 0) == "1": + _dump_launch_params(args, kwargs, launcher, self.fn.__name__) + + # it is faster than entering and exiting a context manager, even if the context + # manager is a nullcontext. + if autograd_profiler._is_profiler_enabled: + # grid can be a tuple of ints or a string. 
+ if isinstance(grid, tuple): + grid_info = str(grid) + else: + grid_info = getattr(grid, "grid_fn_str", "") + with torch._C._profiler._RecordFunctionFast( + self.inductor_meta.get("kernel_name", "triton kernel"), + args, + { + "kernel_file": "" if self.filename is None else self.filename, + "kernel_backend": "triton", + "grid": grid_info, + "stream": stream, + }, + ): + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + else: + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + + +def _find_names(obj): + import gc + import inspect + + frame = inspect.currentframe() + while frame is not None: + frame.f_locals + frame = frame.f_back + obj_names = [] + for referrer in gc.get_referrers(obj): + if isinstance(referrer, dict): + for k, v in referrer.items(): + if v is obj: + obj_names.append(k) + return obj_names + + +collected_calls: List[Any] = [] + + +def start_graph(): + collected_calls.clear() + + +def end_graph(output_file): + if len(collected_calls) == 0: + return + overall_time = sum(call[0] for call in collected_calls) + overall_gb = sum(call[1] for call in collected_calls) + cur_file = inspect.stack()[1].filename + summary_str = ( + f"SUMMARY ({cur_file})\n" + f"{overall_time:.2f}ms \t {overall_gb:.2f} GB\t {overall_gb/(overall_time/1e3):.2f}GB/s" + ) + print(summary_str) + print() + if output_file is not None: + # sort perf numbers in descending order, i.e. placing the + # most runtime-heavy kernels at the top of the list + sorted_calls = sorted(collected_calls, key=lambda c: float(c[0]), reverse=True) + try: + with open(output_file, "a") as file: + log.debug("Save profile bandwidth results to %s", output_file) + file.write("====================\n") + file.write(f"TRITON KERNELS BANDWIDTH INFO ({cur_file})\n") + for ms, num_gb, gb_per_s, kernel_name in sorted_calls: + # also display the runtime percentage for each kernel + percentage = f"{ms/overall_time*100:.2f}%" + suffix = f" \t {percentage} \t {kernel_name}" + bw_info_str = create_bandwidth_info_str( + ms, + num_gb, + gb_per_s, + suffix=suffix, + color=False, + ) + file.write(bw_info_str + "\n") + file.write(f"{summary_str}\n\n") + except Exception as e: + log.warning( + "failed to write profile bandwidth result into %s: %s", + output_file, + e, + ) + + +class DebugAutotuner(CachingAutotuner): + def __init__(self, *args, regex_filter="", with_profiler=False, **kwargs): + self.regex_filter = regex_filter + self.with_profiler = with_profiler + super().__init__(*args, **kwargs) + self.cached = None + + def run(self, *args, grid, stream): + possible_names = _find_names(self) + kernel_name = f"{max(possible_names, key=len)}" + if not re.match(self.regex_filter, kernel_name): + return + super().run(*args, grid=grid, stream=stream) + (launcher,) = self.launchers + + if self.cached is None: + ms = self.bench( + launcher, *args, grid=grid, with_profiler=self.with_profiler + ) + num_in_out_ptrs = len( + [ + arg_name + for arg_name in self.fn.arg_names + if arg_name.startswith("in_out_ptr") + ] + ) + num_gb = self.inductor_meta.get("kernel_num_gb", None) + if num_gb is None: + num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9 + gb_per_s = num_gb / (ms / 1e3) + self.cached = ms, num_gb, gb_per_s, kernel_name + collected_calls.append((ms, num_gb, gb_per_s, kernel_name)) + print( + create_bandwidth_info_str( + ms, num_gb, gb_per_s, suffix=f" \t {kernel_name}" + ) + ) + + +def hash_configs(configs: List[Config]): + """ + Hash used to check for changes in configurations + """ + 
hasher = hashlib.sha256() + for cfg in configs: + hasher.update( + f"{sorted(cfg.kwargs.items())} {cfg.num_warps} {cfg.num_stages}\n".encode() + ) + return hasher.hexdigest() + + +def cached_autotune( + size_hints: Optional[List[int]], + configs: List[Config], + triton_meta, + heuristic_type, + filename=None, + inductor_meta=None, + custom_kernel=False, +): + """ + A copy of triton.autotune that calls our subclass. Our subclass + has additional debugging, error handling, and on-disk caching. + """ + configs = unique_configs(configs) + assert len(configs) == 1 or filename + inductor_meta = {} if inductor_meta is None else inductor_meta + + disabled = inductor_meta.get("force_disable_caches", False) + + # on disk caching logic and/or remote caching + autotune_cache = None + if ( + not disabled + and filename is not None + and (len(configs) > 1 or inductor_meta.get("coordinate_descent_tuning")) + ): + configs_hash = hash_configs(configs) + + autotune_cache = AutotuneCache.create(inductor_meta, filename, configs_hash) + if autotune_cache: + if best_config := autotune_cache.read_best(inductor_meta, configs): + configs = [best_config] + + else: + if disabled: + log.debug("autotune caching is disabled by config.force_disable_caches") + + mutated_arg_names = inductor_meta.pop("mutated_arg_names", ()) + + def decorator(fn): + # Remove XBLOCK from config if it's not a function argument. + # This way, coordinate descent tuning will not try to tune it. + # + # Context: When TritonKernel.no_x_dim is True, we hardcode XBLOCK to 1. + import inspect + + if "XBLOCK" not in inspect.signature(fn.fn).parameters: + for tconfig in configs: + if "XBLOCK" in tconfig.kwargs: + assert tconfig.kwargs["XBLOCK"] == 1 + tconfig.kwargs.pop("XBLOCK") + + if inductor_meta.get("profile_bandwidth"): + return DebugAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + regex_filter=inductor_meta["profile_bandwidth_regex"], + with_profiler=inductor_meta[ + "profile_bandwidth_with_do_bench_using_profiling" + ], + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + ) + return CachingAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + ) + + return decorator + + +def unique_configs(configs: List[Config]): + """Remove duplicate configurations""" + seen = set() + pruned_configs = [] + + for cfg in configs: + key = triton_config_to_hashable(cfg) + if key not in seen: + seen.add(key) + pruned_configs.append(cfg) + return pruned_configs + + +def check_config(cfg, *, xnumel=None, ynumel=None, znumel=None): + for numel, label in zip((xnumel, ynumel, znumel), "XYZ"): + if numel is None: + continue + block = cfg[f"{label}BLOCK"] + if numel == 1: + assert block == 1, ( + f"TritonKernel.indexing assumes numel == 1 => BLOCK == 1" + f" but {label.lower()}numel=={numel} and {label}BLOCK={block} (cfg={cfg})." + ) + max_block = TRITON_MAX_BLOCK[label] + max_block_str = f'config.triton.max_block["{label}"]' + assert max_block % block == 0, ( + f"TritonKernel.indexing assumes {label}BLOCK divides {max_block_str}" + f" but {label}BLOCK={block} and {max_block_str}={max_block} (cfg={cfg})." 
+ ) + + +def _num_warps(num_warps, max_num_warps=8, min_num_warps=2, register_intensive=False): + # On AMD GPU each warp has 64 lanes which is double the size on NV GPU, + # therefore using half the number of warps here correspondingly. + if torch.version.hip: + max_num_warps = (max_num_warps + 1) // 2 + min_num_warps = (min_num_warps + 1) // 2 + # persistent reduction is register intensive + if register_intensive: + max_num_warps = max_num_warps // 2 + return next_power_of_2(min(max(num_warps, min_num_warps), max_num_warps)) + + +def _check_max_grid_x(size_hints, x, num_warps): + # Check if maxGridSize is exceeded - if so then must scale XBLOCK further + max_grid_x = 2147483647 + warp_size = ( + 64 if torch.version.hip else 32 + ) # TODO: query warp size once #129663 is merged + num_blocks = (size_hints[0] + x - 1) // x + + while (num_blocks * num_warps * warp_size) > max_grid_x and x < size_hints[0]: + x *= 2 # Scale up XBLOCK if grid exceeds limits + num_blocks = num_blocks // 2 + if x >= max_grid_x: + raise AssertionError( + "Reduction config exceeds cudaDeviceProp maxGridSize. Please raise a pytorch issue" + ) + return x, num_blocks + + +def triton_config( + size_hints, + x, + y=None, + z=None, + num_stages=1, + num_elements_per_warp=256, + min_elem_per_thread=0, +) -> Config: + """ + Construct a pointwise triton config with some adjustment heuristics + based on size_hints. Size_hints is a tuple of numels in each tile + dimension and will be rounded up to the nearest power of 2. + + num_elements_per_warp is a suggestion for controlling how many warps + the triton config should contain. e.g.: if x=16, y=8, z=4 then + num_elements = 16*8*4 = 512. Then if we set num_elements_per_warp=128, + we'll launch 512 (elem) / 128 (elem/warp) = 4 warps. Note that it's + just a suggestion, and sometimes other adjustment heuristics will + override the num_elements_per_warp. + + min_elem_per_thread controls the minimum number of elements + processed by each thread. It's always enforced. + """ + # Ideally we want to read this from some device config + + # for a 2d size_hints [a, b], a should be mapped to YBLOCK rather than XBLOCK + size_hints = list(reversed(size_hints)) + + maxGridSize = [2147483647, 65535, 65535] + + target = conditional_product(x, y, z) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + if y: + y = min(y, size_hints[1]) + if z: + z = min(z, size_hints[2]) + + # if we are below original block size, scale up where we can; + # or if the calculated grid size is larger than the limit, we bump up the corresponding dimension + while x < min(size_hints[0], TRITON_MAX_BLOCK["X"]) and ( + x * maxGridSize[0] < size_hints[0] or conditional_product(x, y, z) < target + ): + x *= 2 + while ( + y + and y < min(size_hints[1], TRITON_MAX_BLOCK["Y"]) + and ( + y * maxGridSize[1] < size_hints[1] or conditional_product(x, y, z) < target + ) + ): + y *= 2 + while ( + z + and z < min(size_hints[2], TRITON_MAX_BLOCK["Z"]) + and ( + z * maxGridSize[2] < size_hints[2] or conditional_product(x, y, z) < target + ) + ): + z *= 2 + + num_warps = _num_warps( + conditional_product(x, y, z) // num_elements_per_warp, min_num_warps=1 + ) + # we are going to arrive at 2 warps only if bs was too small due to + # numel being too small. 
However, to work around some PTX bugs, we still + # want at least 4 warps if there are enough elements per thread. + # Given that this is a rare situation, don't expect this to affect perf + # in general + # see https://github.com/pytorch/pytorch/pull/97950 + if conditional_product(x, y, z) >= 128 and not torch.version.hip: + num_warps = max(num_warps, 4) + xnumel = size_hints[0] + ynumel = size_hints[1] if y else None + znumel = size_hints[2] if z else None + + # Increase x to satisfy min_elem_per_thread requirements. + block_size = max( + conditional_product(x, y, z), + min_elem_per_thread * _NUM_THREADS_PER_WARP * num_warps, + ) + x *= math.ceil(block_size / conditional_product(x, y, z)) + + x, _num_blocks = _check_max_grid_x(size_hints, x, num_warps) + + cfg = {"XBLOCK": x} + if y: + cfg["YBLOCK"] = y + if z: + cfg["ZBLOCK"] = z + assert x <= TRITON_MAX_BLOCK["X"], f"increase TRITON_MAX_BLOCK['X'] to {x}" + check_config(cfg, xnumel=xnumel, ynumel=ynumel, znumel=znumel) + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def triton_config_reduction( + size_hints, x, r, num_stages=1, num_warps=None, register_intensive=False +) -> Config: + """ + Construct a reduction triton config with some adjustment heuristics + based on size_hints. Size_hints is a tuple of numels in each tile + dimension and will be rounded up to the nearest power of 2. + """ + + target = conditional_product(x, r) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + r = min(r, size_hints[1]) + + # if we are below original block size, scale up where we can + while x < size_hints[0] and conditional_product(x, r) < target: + x *= 2 + while r < size_hints[1] and conditional_product(x, r) < target: + r *= 2 + + if num_warps is None: + num_warps = conditional_product(x, r) // 128 + num_warps = _num_warps( + num_warps, max_num_warps=16, register_intensive=register_intensive + ) + + x, _num_blocks = _check_max_grid_x(size_hints, x, num_warps) + + while conditional_product(x, r) > target: + if r == 1: + break + r = r // 2 + + cfg = {"XBLOCK": x, "RBLOCK": r} + check_config(cfg, xnumel=size_hints[0]) + assert x <= TRITON_MAX_BLOCK["X"], f"increase TRITON_MAX_BLOCK['X'] to {x}" + assert r <= TRITON_MAX_BLOCK["R"], f"increase TRITON_MAX_BLOCK['R'] to {r}" + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def triton_config_tiled_reduction(size_hints, x, y, r, num_stages=1): + """ + Construct a tile reduction triton config with some adjustment + heuristics based on size_hints. Size_hints is a tuple of numels in + each tile dimension and will be rounded up to the nearest power of 2.
+ """ + + target = conditional_product(x, y, r) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + y = min(y, size_hints[1]) + r = min(r, size_hints[2]) + + # if we are below original block size, scale up where we can + while x < size_hints[0] and conditional_product(x, y, r) < target: + x *= 2 + while r < size_hints[2] and conditional_product(x, y, r) < target: + r *= 2 + while y < size_hints[1] and conditional_product(x, y, r) < target: + y *= 2 + + cfg = {"XBLOCK": x, "YBLOCK": y, "RBLOCK": r} + num_warps = _num_warps(conditional_product(x, y, r) // 256, min_num_warps=1) + check_config(cfg, xnumel=size_hints[0], ynumel=size_hints[1]) + assert r <= TRITON_MAX_BLOCK["R"], f"increase TRITON_MAX_BLOCK['r'] to {r}" + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def pointwise( + size_hints, + triton_meta, + tile_hint=None, + filename=None, + min_elem_per_thread=0, + inductor_meta=None, +): + """ + Construct @triton.heuristics() based on size_hints. + """ + inductor_meta = {} if inductor_meta is None else inductor_meta + assert not inductor_meta.get("no_x_dim") + + numel = functools.reduce(operator.mul, size_hints) + bs = max(256, min(numel // 128, 1024)) + + hinted_configs = autotune_hints_to_configs( + inductor_meta.get("autotune_hints", set()), size_hints, bs + ) + + triton_config_with_settings = functools.partial( + triton_config, min_elem_per_thread=min_elem_per_thread + ) + + if len(size_hints) == 1: + if disable_pointwise_autotuning(inductor_meta) and not ( + inductor_meta.get("max_autotune") + or inductor_meta.get("max_autotune_pointwise") + ): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, bs)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + else: + return cached_autotune( + size_hints, + [ + triton_config_with_settings( + size_hints, bs, num_elements_per_warp=256 + ), + triton_config_with_settings( + size_hints, bs // 2, num_elements_per_warp=64 + ), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + if len(size_hints) == 2: + if ( + disable_pointwise_autotuning(inductor_meta) or tile_hint == TileHint.SQUARE + ) and not ( + inductor_meta.get("max_autotune") + or inductor_meta.get("max_autotune_pointwise") + ): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, 32, 32)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + return cached_autotune( + size_hints, + [ + triton_config_with_settings(size_hints, 32, 32), + triton_config_with_settings(size_hints, 64, 64), # ~8% better for fp16 + triton_config_with_settings(size_hints, 256, 16), + triton_config_with_settings(size_hints, 16, 256), + triton_config_with_settings(size_hints, bs, 1), + triton_config_with_settings(size_hints, 1, bs), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.POINTWISE, + ) + if len(size_hints) == 3: + if disable_pointwise_autotuning(inductor_meta): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, 16, 16, 16)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + return cached_autotune( + size_hints, + [ + 
triton_config_with_settings(size_hints, 16, 16, 16), + triton_config_with_settings(size_hints, 64, 8, 8), + triton_config_with_settings(size_hints, 8, 64, 8), + triton_config_with_settings(size_hints, 8, 8, 64), + triton_config_with_settings(size_hints, bs, 1, 1), + triton_config_with_settings(size_hints, 1, bs, 1), + triton_config_with_settings(size_hints, 1, 1, bs), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.POINTWISE, + ) + raise NotImplementedError(f"size_hints: {size_hints}") + + +def _reduction_configs( + *, size_hints: List[int], inductor_meta: Dict[str, Any] +) -> List[Config]: + reduction_hint = inductor_meta.get("reduction_hint", None) + assert len(size_hints) == 2 + rnumel = size_hints[-1] + + register_intensive = False + MAX_RBLOCK = 2048 + if ( + size_hints[0] >= 1024 + and inductor_meta.get("num_load", 0) + inductor_meta.get("num_reduction", 0) + >= 10 + ): + # A heuristic to reduce RBLOCK if a kernel potentially needs many registers. + # Consider loads and reductions, since loads need to move data into registers + # and reductions need an accumulator. + # + # The magic numbers are a bit arbitrary. + # + # We cannot rely on dynamically scaling down RBLOCK later, since sometimes + # Triton ends up using fewer registers with worse perf. Check: + # https://github.com/pytorch/pytorch/issues/126463 + # + # The heuristic is a very simple one since registers can be reused. But + # hopefully it can be a good enough indicator. + MAX_RBLOCK = 1024 + register_intensive = True + + contiguous_config = triton_config_reduction( + size_hints, + 1, + (rnumel if 256 <= rnumel < MAX_RBLOCK else MAX_RBLOCK), + register_intensive=register_intensive, + ) + outer_config = triton_config_reduction( + size_hints, 64, 8, register_intensive=register_intensive + ) + tiny_config = triton_config_reduction( + size_hints, + 2 * (256 // rnumel) if rnumel <= 256 else 1, + min(rnumel, MAX_RBLOCK), + register_intensive=register_intensive, + ) + if inductor_meta.get("max_autotune") or inductor_meta.get("max_autotune_pointwise"): + pass # skip all these cases + elif reduction_hint == ReductionHint.INNER: + return [contiguous_config] + elif reduction_hint == ReductionHint.OUTER: + return [outer_config] + elif reduction_hint == ReductionHint.OUTER_TINY: + return [tiny_config] + if disable_pointwise_autotuning(inductor_meta): + return [triton_config_reduction(size_hints, 32, 128)] + return [ + contiguous_config, + outer_config, + tiny_config, + triton_config_reduction(size_hints, 64, 64), + triton_config_reduction(size_hints, 8, 512), + # halve the XBLOCK/RBLOCK compared to outer_config + # TODO: this may only be beneficial when each iteration of the reduction + # is quite heavy. E.g.
https://gist.github.com/shunting314/189a8ef69f90db9d614a823385147a72 + triton_config_reduction(size_hints, 64, 4, num_warps=8), + ] + + +def reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """args to @triton.heuristics()""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + assert triton_meta is not None + rnumel = size_hints[-1] + if len(size_hints) != 2: + raise NotImplementedError(f"size_hints: {size_hints}") + + configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta) + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.REDUCTION, + filename=filename, + ) + + +def persistent_reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + xnumel, rnumel = size_hints + + configs = [ + triton_config_reduction(size_hints, xblock, rnumel, register_intensive=True) + for xblock in (1, 8, 32, 128) + if xblock == 1 or (rnumel * xblock <= 4096 and xblock <= xnumel) + ] + + # TODO(jansel): we should be able to improve these heuristics + if reduction_hint == ReductionHint.INNER and rnumel >= 256: + configs = configs[:1] + elif reduction_hint == ReductionHint.OUTER: + configs = configs[-1:] + elif reduction_hint == ReductionHint.OUTER_TINY: + configs = [ + triton_config_reduction( + size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel + ) + ] + for c in configs: + # we don't need RBLOCK for persistent reduction + c.kwargs.pop("RBLOCK") + + if disable_pointwise_autotuning(inductor_meta): + configs = configs[:1] + + return cached_autotune( + size_hints, + configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.PERSISTENT_REDUCTION, + ) + + +def split_scan( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """Heuristic for TritonSplitScanKernel""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + assert triton_meta is not None + if len(size_hints) != 2: + raise NotImplementedError(f"size_hints: {size_hints}") + + configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta) + + # Fixup configs to enforce the minimum RBLOCK size + min_rblock = inductor_meta.get("min_split_scan_rblock", 256) + for cfg in configs: + if cfg.kwargs["RBLOCK"] < min_rblock: + cfg.kwargs["RBLOCK"] = min_rblock + + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.SPLIT_SCAN, + filename=filename, + ) + + +def template(num_stages, num_warps, triton_meta, filename=None, inductor_meta=None): + """ + Compile a triton template + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=num_stages, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +def user_autotune( + configs, triton_meta, 
filename=None, inductor_meta=None, custom_kernel=False +): + """ + Compile a user defined triton kernel + """ + defaults = inspect.signature(triton.Config).parameters + default_num_stages = defaults["num_stages"].default + default_num_warps = defaults["num_warps"].default + + if len(configs) == 0: + configs = [ + triton.Config( + {}, num_stages=default_num_stages, num_warps=default_num_warps + ) + ] + else: + configs = [ + triton.Config( + c.get("kwargs", {}), + num_stages=c.get("num_stages", default_num_stages), + num_warps=c.get("num_warps", default_num_warps), + ) + for c in configs + ] + + return cached_autotune( + None, + configs, + triton_meta=triton_meta, + heuristic_type=HeuristicType.USER_AUTOTUNE, + filename=filename, + inductor_meta=inductor_meta, + custom_kernel=custom_kernel, + ) + + +def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): + """ + Compile a triton foreach kernel + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=1, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +def grid(*numels): + """Helper function to compute triton grids""" + if len(numels) == 1: + xnumel, ynumel, znumel = numels[0], None, None + elif len(numels) == 2: + xnumel, ynumel, znumel = numels[1], numels[0], None + elif len(numels) == 3: + xnumel, ynumel, znumel = numels[2], numels[1], numels[0] + else: + raise AssertionError(f"invalid size for numels {len(numels)}") + + def get_grid_dim(numel, block): + if numel is None: + return 1 + if block is None: + return numel + return ceildiv(numel, block) + + def grid_fn(meta): + x_grid = get_grid_dim(xnumel, meta.get("XBLOCK", 1)) + y_grid = get_grid_dim(ynumel, meta.get("YBLOCK", None)) + + max_y_grid = get_max_y_grid() + if znumel is None: + div = ceildiv(y_grid, max_y_grid) + y_grid = ceildiv(y_grid, div) + z_grid = div + else: + z_grid = get_grid_dim(znumel, meta.get("ZBLOCK", None)) + torch._check( + y_grid <= max_y_grid, + lambda: f"Generated y grid beyond 2^16 ({y_grid}) not supported with z dimension present. 
File issue", + ) + + return ( + x_grid, + y_grid, + z_grid, + ) + + setattr(grid_fn, "grid_fn_str", f"grid{numels}") # noqa: B010 + + return grid_fn + + +def split_scan_grid(xnumel, rnumel): + def grid_fn(meta): + assert meta.get("XBLOCK", 1) == 1 + return (ceildiv(rnumel, meta.get("RBLOCK", 1)), xnumel, 1) + + grid_fn_str = f"split_scan_grid({xnumel}, {rnumel})" + setattr(grid_fn, "grid_fn_str", grid_fn_str) # noqa: B010 + + return grid_fn + + +def grid_combo_kernels( + *numels, num_kernels, min_blocks, is_sequential, default_meta=None +): + """min_blocks is the minimal size of the grid x dimension""" + if not is_sequential: + # round robin dispatch + numels_agg = list(numels) + for i in range(len(numels_agg)): + if isinstance(numels_agg[i], (list, tuple)): + numels_agg[i] = max(max(numels_agg[i]), 0) # noqa: PLW3301 + kernel_grid_fn = grid(*numels_agg) + + if isinstance(numels[-1], (list, tuple)): + min_blocks_d = max(-min(numels[-1]), 0) * num_kernels + else: + min_blocks_d = None + if min_blocks is None: + assert min_blocks_d is not None + min_blocks = min_blocks_d + else: + assert ( + min_blocks_d is None or min_blocks == min_blocks_d + ), f"inconsistent min_blocks {min_blocks} vs x grid {numels[-1]}" + else: + # sequential dispatch + seq_numels = list(numels) + # x numels are not used here, just a place holder + seq_numels[-1] = 1024 + for i in range(len(seq_numels) - 1): + if isinstance(seq_numels[i], (list, tuple)): + seq_numels[i] = max(seq_numels[i]) + + kernel_grid_fn = grid(*seq_numels) + + def get_grid_dim(numel, block): + if numel is None: + return 1 + if block is None: + return numel + return ceildiv(numel, block) + + def grid_fn(meta): + assert min_blocks is not None, "min_blocks must be a number" + cuda_grid = list(kernel_grid_fn(meta)) + cuda_grid[0] = max(num_kernels * cuda_grid[0], min_blocks) + return tuple(cuda_grid) + + def seq_grid_fn(meta): + cuda_grid = list(kernel_grid_fn(meta)) + # x <= 0 means this kernel's x grid is not tunable (x_no_dim is true) + x_grid = sum( + [ + -x if x <= 0 else get_grid_dim(x, meta.get("XBLOCK", 1)) + for x in numels[-1] + ] + ) + cuda_grid[0] = x_grid + return tuple(cuda_grid) + + def grid_fn_default_meta(meta): + return grid_fn(default_meta) + + def seq_grid_fn_default_meta(meta): + return seq_grid_fn(default_meta) + + if default_meta is None: + return grid_fn if not is_sequential else seq_grid_fn + else: + return grid_fn_default_meta if not is_sequential else seq_grid_fn_default_meta diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_config.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_config.h new file mode 100644 index 0000000000000000000000000000000000000000..48925e1e3ab49ae135c6e9c4c501aa2f5e030913 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_config.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef DNNL_CONFIG_H +#define DNNL_CONFIG_H + +#include "oneapi/dnnl/dnnl_config.h" + +#endif /* DNNL_CONFIG_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_debug.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_debug.h new file mode 100644 index 0000000000000000000000000000000000000000..5044971832bbbe56127920a527508b207a803eea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_debug.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_DEBUG_H +#define DNNL_DEBUG_H + +#include "oneapi/dnnl/dnnl_debug.h" + +#endif /* DNNL_DEBUG_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h new file mode 100644 index 0000000000000000000000000000000000000000..ad731150b28babe7bd5a911acd8de70c57e85254 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_OCL_H +#define DNNL_OCL_H + +#include "oneapi/dnnl/dnnl_ocl.h" + +#endif /* DNNL_OCL_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_sycl.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_sycl.h new file mode 100644 index 0000000000000000000000000000000000000000..4501598c2f461021f0fa818e95fd1972ce2d3ace --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_sycl.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_SYCL_H +#define DNNL_SYCL_H + +#include "oneapi/dnnl/dnnl_sycl.h" + +#endif /* DNNL_SYCL_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_threadpool.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..e27e584a65ed16740d4fde93da3a1a049dd111aa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_threadpool.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_THREADPOOL_H +#define DNNL_THREADPOOL_H + +#include "oneapi/dnnl/dnnl_threadpool.h" + +#endif /* DNNL_THREADPOOL_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_types.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_types.h new file mode 100644 index 0000000000000000000000000000000000000000..6f4261b712dc37ec2416ba60c0c68bb30f6995e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_types.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_TYPES_H +#define DNNL_TYPES_H + +#include "oneapi/dnnl/dnnl_types.h" + +#endif /* DNNL_TYPES_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..32a3d5cf839b1d593f069520febfd60b323730e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_VERSION_H +#define DNNL_VERSION_H + +#include "oneapi/dnnl/dnnl_version.h" + +#endif /* DNNL_VERSION_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/experiments-config.h b/.venv/lib/python3.11/site-packages/torch/include/experiments-config.h new file mode 100644 index 0000000000000000000000000000000000000000..7c0cba4acdaef0784e7b96bfd6e755254d3eecb4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/experiments-config.h @@ -0,0 +1,25 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct xnn_experiment_config { + bool adaptive_avx_optimization; +}; + +struct xnn_experiment_config* xnn_get_experiment_config(); + +void xnn_experiment_enable_adaptive_avx_optimization(); + + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/.venv/lib/python3.11/site-packages/torch/include/fp16.h b/.venv/lib/python3.11/site-packages/torch/include/fp16.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7366e997dadef17922225bcbb489288f6f9cdc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/fp16.h @@ -0,0 +1,11 @@ +#pragma once +#ifndef FP16_H +#define FP16_H + +#include <fp16/fp16.h> + +#if defined(PSIMD_H) +#include <fp16/psimd.h> +#endif + +#endif /* FP16_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/fxdiv.h b/.venv/lib/python3.11/site-packages/torch/include/fxdiv.h new file mode 100644 index 0000000000000000000000000000000000000000..2c35038d97c55c524bb97caba2e3560cab9da504 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/fxdiv.h @@ -0,0 +1,425 @@ +#pragma once +#ifndef FXDIV_H +#define FXDIV_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include <cstddef> + #include <cstdint> + #include <climits> +#elif !defined(__OPENCL_VERSION__) + #include <stddef.h> + #include <stdint.h> + #include <limits.h> +#endif + +#if defined(_MSC_VER) + #include <intrin.h> + #if defined(_M_IX86) || defined(_M_X64) + #include <immintrin.h> + #endif +#endif + +#ifndef FXDIV_USE_INLINE_ASSEMBLY + #define FXDIV_USE_INLINE_ASSEMBLY 0 +#endif + +static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) { +#if defined(_MSC_VER) && defined(_M_IX86) + return (uint64_t) __emulu((unsigned int) a, (unsigned int) b); +#else + return (uint64_t) a * (uint64_t) b; +#endif +} + +static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) { +#if defined(__OPENCL_VERSION__) + return mul_hi(a, b); +#elif defined(__CUDA_ARCH__) + return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b); +#elif defined(_MSC_VER) && defined(_M_IX86) + return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32); +#elif defined(_MSC_VER) && defined(_M_ARM) + return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b); +#else + return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32); +#endif +} + +static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) { +#if defined(__OPENCL_VERSION__) +
return mul_hi(a, b); +#elif defined(__CUDA_ARCH__) + return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b); +#elif defined(_MSC_VER) && defined(_M_X64) + return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b); +#elif defined(__GNUC__) && defined(__SIZEOF_INT128__) + return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64); +#else + const uint32_t a_lo = (uint32_t) a; + const uint32_t a_hi = (uint32_t) (a >> 32); + const uint32_t b_lo = (uint32_t) b; + const uint32_t b_hi = (uint32_t) (b >> 32); + + const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) + + (uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo); + return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) + + ((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32); +#endif +} + +static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) { +#if SIZE_MAX == UINT32_MAX + return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b); +#elif SIZE_MAX == UINT64_MAX + return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b); +#else + #error Unsupported platform +#endif +} + +struct fxdiv_divisor_uint32_t { + uint32_t value; + uint32_t m; + uint8_t s1; + uint8_t s2; +}; + +struct fxdiv_result_uint32_t { + uint32_t quotient; + uint32_t remainder; +}; + +struct fxdiv_divisor_uint64_t { + uint64_t value; + uint64_t m; + uint8_t s1; + uint8_t s2; +}; + +struct fxdiv_result_uint64_t { + uint64_t quotient; + uint64_t remainder; +}; + +struct fxdiv_divisor_size_t { + size_t value; + size_t m; + uint8_t s1; + uint8_t s2; +}; + +struct fxdiv_result_size_t { + size_t quotient; + size_t remainder; +}; + +static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) { + struct fxdiv_divisor_uint32_t result = { d }; + if (d == 1) { + result.m = UINT32_C(1); + result.s1 = 0; + result.s2 = 0; + } else { + #if defined(__OPENCL_VERSION__) + const uint32_t l_minus_1 = 31 - clz(d - 1); + #elif defined(__CUDA_ARCH__) + const uint32_t l_minus_1 = 31 - __clz((int) (d - 1)); + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) + unsigned long l_minus_1; + _BitScanReverse(&l_minus_1, (unsigned long) (d - 1)); + #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY + uint32_t l_minus_1; + __asm__("BSRL %[d_minus_1], %[l_minus_1]" + : [l_minus_1] "=r" (l_minus_1) + : [d_minus_1] "r" (d - 1) + : "cc"); + #elif defined(__GNUC__) + const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1); + #else + /* Based on Algorithm 2 from Hacker's delight */ + + uint32_t l_minus_1 = 0; + uint32_t x = d - 1; + uint32_t y = x >> 16; + if (y != 0) { + l_minus_1 += 16; + x = y; + } + y = x >> 8; + if (y != 0) { + l_minus_1 += 8; + x = y; + } + y = x >> 4; + if (y != 0) { + l_minus_1 += 4; + x = y; + } + y = x >> 2; + if (y != 0) { + l_minus_1 += 2; + x = y; + } + if ((x & 2) != 0) { + l_minus_1 += 1; + } + #endif + uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d; + + /* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */ + #if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY + uint32_t q; + __asm__("DIVL %[d]" + : "=a" (q), "+d" (u_hi) + : [d] "r" (d), "a" (0) + : "cc"); + #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64)) + unsigned int remainder; + const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, 
&remainder); + #else + const uint32_t q = ((uint64_t) u_hi << 32) / d; + #endif + + result.m = q + UINT32_C(1); + result.s1 = 1; + result.s2 = (uint8_t) l_minus_1; + } + return result; +} + +static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) { + struct fxdiv_divisor_uint64_t result = { d }; + if (d == 1) { + result.m = UINT64_C(1); + result.s1 = 0; + result.s2 = 0; + } else { + #if defined(__OPENCL_VERSION__) + const uint32_t nlz_d = clz(d); + const uint32_t l_minus_1 = 63 - clz(d - 1); + #elif defined(__CUDA_ARCH__) + const uint32_t nlz_d = __clzll((long long) d); + const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1)); + #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) + unsigned long l_minus_1; + _BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1)); + unsigned long bsr_d; + _BitScanReverse64(&bsr_d, (unsigned __int64) d); + const uint32_t nlz_d = bsr_d ^ 0x3F; + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM)) + const uint64_t d_minus_1 = d - 1; + const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0; + unsigned long l_minus_1; + if ((uint32_t) (d_minus_1 >> 32) == 0) { + _BitScanReverse(&l_minus_1, (unsigned long) d_minus_1); + } else { + _BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32)); + l_minus_1 += 32; + } + const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2; + #elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY + uint64_t l_minus_1; + __asm__("BSRQ %[d_minus_1], %[l_minus_1]" + : [l_minus_1] "=r" (l_minus_1) + : [d_minus_1] "r" (d - 1) + : "cc"); + #elif defined(__GNUC__) + const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1); + const uint32_t nlz_d = __builtin_clzll(d); + #else + /* Based on Algorithm 2 from Hacker's delight */ + const uint64_t d_minus_1 = d - 1; + const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0; + uint32_t l_minus_1 = 0; + uint32_t x = (uint32_t) d_minus_1; + uint32_t y = d_minus_1 >> 32; + if (y != 0) { + l_minus_1 += 32; + x = y; + } + y = x >> 16; + if (y != 0) { + l_minus_1 += 16; + x = y; + } + y = x >> 8; + if (y != 0) { + l_minus_1 += 8; + x = y; + } + y = x >> 4; + if (y != 0) { + l_minus_1 += 4; + x = y; + } + y = x >> 2; + if (y != 0) { + l_minus_1 += 2; + x = y; + } + if ((x & 2) != 0) { + l_minus_1 += 1; + } + const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2; + #endif + uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d; + + /* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */ + #if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY + uint64_t q; + __asm__("DIVQ %[d]" + : "=a" (q), "+d" (u_hi) + : [d] "r" (d), "a" (UINT64_C(0)) + : "cc"); + #elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__) + /* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */ + const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d)); + #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64) + unsigned __int64 remainder; + const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder); + #else + /* Implementation based on code from Hacker's delight */ + + /* Normalize divisor and shift divident left */ + d <<= nlz_d; + u_hi <<= nlz_d; + /* Break divisor up into two 32-bit digits */ + const uint64_t d_hi = (uint32_t) (d >> 32); + 
const uint32_t d_lo = (uint32_t) d; + + /* Compute the first quotient digit, q1 */ + uint64_t q1 = u_hi / d_hi; + uint64_t r1 = u_hi - q1 * d_hi; + + while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) { + q1 -= 1; + r1 += d_hi; + if ((r1 >> 32) != 0) { + break; + } + } + + /* Multiply and subtract. */ + u_hi = (u_hi << 32) - q1 * d; + + /* Compute the second quotient digit, q0 */ + uint64_t q0 = u_hi / d_hi; + uint64_t r0 = u_hi - q0 * d_hi; + + while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) { + q0 -= 1; + r0 += d_hi; + if ((r0 >> 32) != 0) { + break; + } + } + const uint64_t q = (q1 << 32) | (uint32_t) q0; + #endif + result.m = q + UINT64_C(1); + result.s1 = 1; + result.s2 = (uint8_t) l_minus_1; + } + return result; +} + +static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) { +#if SIZE_MAX == UINT32_MAX + const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d); +#elif SIZE_MAX == UINT64_MAX + const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d); +#else + #error Unsupported platform +#endif + struct fxdiv_divisor_size_t size_result = { + (size_t) uint_result.value, + (size_t) uint_result.m, + uint_result.s1, + uint_result.s2 + }; + return size_result; +} + +static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { + const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m); + return (t + ((n - t) >> divisor.s1)) >> divisor.s2; +} + +static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { + const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m); + return (t + ((n - t) >> divisor.s1)) >> divisor.s2; +} + +static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { +#if SIZE_MAX == UINT32_MAX + const struct fxdiv_divisor_uint32_t uint32_divisor = { + (uint32_t) divisor.value, + (uint32_t) divisor.m, + divisor.s1, + divisor.s2 + }; + return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor); +#elif SIZE_MAX == UINT64_MAX + const struct fxdiv_divisor_uint64_t uint64_divisor = { + (uint64_t) divisor.value, + (uint64_t) divisor.m, + divisor.s1, + divisor.s2 + }; + return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor); +#else + #error Unsupported platform +#endif +} + +static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { + const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor); + return n - quotient * divisor.value; +} + +static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { + const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor); + return n - quotient * divisor.value; +} + +static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { + const size_t quotient = fxdiv_quotient_size_t(n, divisor); + return n - quotient * divisor.value; +} + +static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) { + const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity); + return quotient * granularity.value; +} + +static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) { + const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity); + return quotient * granularity.value; +} + +static inline size_t fxdiv_round_down_size_t(size_t n, const struct 
fxdiv_divisor_size_t granularity) { + const size_t quotient = fxdiv_quotient_size_t(n, granularity); + return quotient * granularity.value; +} + +static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { + const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor); + const uint32_t remainder = n - quotient * divisor.value; + struct fxdiv_result_uint32_t result = { quotient, remainder }; + return result; +} + +static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { + const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor); + const uint64_t remainder = n - quotient * divisor.value; + struct fxdiv_result_uint64_t result = { quotient, remainder }; + return result; +} + +static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { + const size_t quotient = fxdiv_quotient_size_t(n, divisor); + const size_t remainder = n - quotient * divisor.value; + struct fxdiv_result_size_t result = { quotient, remainder }; + return result; +} + +#endif /* FXDIV_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/libshm.h b/.venv/lib/python3.11/site-packages/torch/include/libshm.h new file mode 100644 index 0000000000000000000000000000000000000000..28024aa2338d1f46ce280abeb92a633f89be1385 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/libshm.h @@ -0,0 +1,46 @@ +#pragma once + +#include <ATen/MapAllocator.h> + +#ifdef __cplusplus + +void libshm_init(const char* manager_exec_path); + +// Superclass to run a constructor before at::RefcountedMapAllocator +class THManagedMapAllocatorInit { + protected: + THManagedMapAllocatorInit(const char* manager_handle, const char* filename); + std::string manager_handle_; +}; + +// Like an at::RefcountedMapAllocator, but it also makes use of an external +// shared memory manager process to ensure that shared memory regions actually +// get freed in the end (even if processes lose the memory). +class THManagedMapAllocator : private THManagedMapAllocatorInit, + public at::RefcountedMapAllocator { + public: + THManagedMapAllocator( + const char* manager_handle, + const char* filename, + int flags, + size_t size); + + void close() override; + + ~THManagedMapAllocator() override { + close(); + } + + static at::DataPtr makeDataPtr( + const char* manager_handle, + const char* filename, + int flags, + size_t size); + static THManagedMapAllocator* fromDataPtr(const at::DataPtr&); + + const char* manager_handle() const { + return manager_handle_.c_str(); + } +}; + +#endif diff --git a/.venv/lib/python3.11/site-packages/torch/include/nnpack.h b/.venv/lib/python3.11/site-packages/torch/include/nnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..97b5ff390076e9ab7ae91e67bfc0d78736aaeffd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/nnpack.h @@ -0,0 +1,659 @@ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <stdbool.h> + +#include <pthreadpool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Status code for any NNPACK function call. + */ +enum nnp_status { + /** The call succeeded, and all output arguments now contain valid data. */ + nnp_status_success = 0, + /** NNPACK function was called with batch_size == 0. */ + nnp_status_invalid_batch_size = 2, + /** NNPACK function was called with channels == 0. */ + nnp_status_invalid_channels = 3, + /** NNPACK function was called with input_channels == 0.
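The fxdiv.h header completed above is easiest to read from the call side: fxdiv_init_* precomputes a magic multiplier m and shift amounts s1/s2 for one divisor, after which every quotient costs a high multiply plus a couple of shifts and adds instead of a hardware divide. A usage sketch (assuming the header is on the include path; the divisor and loop bound are arbitrary):

#include <assert.h>
#include <stdint.h>
#include "fxdiv.h"

int main(void) {
    /* Precompute the magic constants for dividing by 7 once... */
    const struct fxdiv_divisor_uint32_t by7 = fxdiv_init_uint32_t(7);
    /* ...then quotient and remainder agree with the hardware operators. */
    for (uint32_t n = 0; n < 10000; n++) {
        assert(fxdiv_quotient_uint32_t(n, by7) == n / 7);
        assert(fxdiv_remainder_uint32_t(n, by7) == n % 7);
    }
    return 0;
}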
*/ + nnp_status_invalid_input_channels = 4, + /** NNPACK function was called with output_channels == 0. */ + nnp_status_invalid_output_channels = 5, + /** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */ + nnp_status_invalid_input_size = 10, + /** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */ + nnp_status_invalid_input_stride = 11, + /** NNPACK function was called with input_padding not less than respective kernel (or pooling) size, i.e.: + * + * - input_padding.left >= kernel_size.width (>= pooling_size.width) + * - input_padding.right >= kernel_size.width (>= pooling_size.width) + * - input_padding.top >= kernel_size.height (>= pooling_size.height) + * - input_padding.bottom >= kernel_size.height (>= pooling_size.height) + */ + nnp_status_invalid_input_padding = 12, + /** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */ + nnp_status_invalid_kernel_size = 13, + /** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */ + nnp_status_invalid_pooling_size = 14, + /** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */ + nnp_status_invalid_pooling_stride = 15, + /** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */ + nnp_status_invalid_algorithm = 16, + /** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */ + nnp_status_invalid_transform_strategy = 17, + /** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0 */ + nnp_status_invalid_output_subsampling = 13, + /** NNPACK function was called with activation not in nnp_activation enum */ + nnp_status_invalid_activation = 14, + /** NNPACK function was called with invalid activation parameters */ + nnp_status_invalid_activation_parameters = 15, + + /** NNPACK does not support the particular input size for the function */ + nnp_status_unsupported_input_size = 20, + /** NNPACK does not support the particular input stride for the function */ + nnp_status_unsupported_input_stride = 21, + /** NNPACK does not support the particular input padding for the function */ + nnp_status_unsupported_input_padding = 22, + /** NNPACK does not support the particular kernel size for the function */ + nnp_status_unsupported_kernel_size = 23, + /** NNPACK does not support the particular pooling size for the function */ + nnp_status_unsupported_pooling_size = 24, + /** NNPACK does not support the particular pooling stride for the function */ + nnp_status_unsupported_pooling_stride = 25, + /** NNPACK does not support the particular convolution algorithm for the function */ + nnp_status_unsupported_algorithm = 26, + /** NNPACK does not support the particular convolution transform strategy for the algorithm */ + nnp_status_unsupported_transform_strategy = 27, + /** NNPACK does not support the particular activation function for the function */ + nnp_status_unsupported_activation = 28, + /** NNPACK does not support the particular activation function parameters for the function */ + nnp_status_unsupported_activation_parameters = 29, + + /** NNPACK function was called before the library was initialized */ + nnp_status_uninitialized = 50, + /** NNPACK does not implement this function for the host CPU */ + nnp_status_unsupported_hardware = 51, + /** NNPACK failed to allocate memory for temporary buffers */ + nnp_status_out_of_memory = 52, + /** 
Scratch space buffer is too small */ + nnp_status_insufficient_buffer = 53, + /** Scratch space buffer is not properly aligned */ + nnp_status_misaligned_buffer = 54 +}; + +/** + * @brief Activation applied after a convolutional or fully-connected layer. + */ +enum nnp_activation { + /** Identity activation f(x) := x, i.e. no transformation */ + nnp_activation_identity = 0, + /** ReLU activation f(x) := max(0, x) */ + nnp_activation_relu = 1, +}; + +/** + * @brief Algorithm for computing convolutional layers. + */ +enum nnp_convolution_algorithm { + /** Let NNPACK choose the algorithm depending on layer parameters */ + nnp_convolution_algorithm_auto = 0, + /** Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */ + nnp_convolution_algorithm_ft8x8 = 1, + /** Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */ + nnp_convolution_algorithm_ft16x16 = 2, + /** Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Supports only 3x3 kernels. */ + nnp_convolution_algorithm_wt8x8 = 3, + /** Direct convolution via implicit GEMM. */ + nnp_convolution_algorithm_implicit_gemm = 4, + /** Direct convolution implementation. */ + nnp_convolution_algorithm_direct = 5, + /** + * Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks in FP16. + * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP), + * on non-supported processors falls back to nnp_convolution_algorithm_wt8x8. + */ + nnp_convolution_algorithm_wt8x8_fp16 = 6, +}; + +enum nnp_convolution_transform_strategy { + nnp_convolution_transform_strategy_compute = 1, + nnp_convolution_transform_strategy_precompute = 2, + nnp_convolution_transform_strategy_reuse = 3 +}; + +/* For backward compatibility */ +#define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute +#define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute + +/** + * @brief Size of images, kernels, and pooling filters in NNPACK. + */ +struct nnp_size { + /** Width (horizontal size) of an image, kernel, or pooling filter. */ + size_t width; + /** Height (vertical size) of an image, kernel, or pooling filter. */ + size_t height; +}; + +/** + * @brief Padding of images in NNPACK. + */ +struct nnp_padding { + /** Padding above the image data */ + size_t top; + /** Padding on the right of image data */ + size_t right; + /** Padding below the image data */ + size_t bottom; + /** Padding on the left of image data */ + size_t left; +}; + +/** + * @brief Profiling information about time spent in different phases of a function call. + */ +struct nnp_profile { + /** Time spent inside the function call, in seconds. */ + double total; + /** Time spent on transformation of the input or input gradient tensor, in seconds. */ + double input_transform; + /** Time spent on transformation of the kernel or kernel gradient tensor, in seconds. */ + double kernel_transform; + /** Time spent on transformation of the output or output gradient tensor, in seconds. */ + double output_transform; + /** Time spent on multiplication-accumulation of transformed coefficients, in seconds. */ + double block_multiplication; +}; + +enum nnp_status nnp_initialize(void); + +enum nnp_status nnp_deinitialize(void); + +/** + * @brief Computes output of a 2D convolutional layer from input and kernel tensors.
+ * @details This function targets training of convolutional neural networks and performs forward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * For minibatch size 1, use nnp_convolution_inference for optimal performance. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. + * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). + * Supports only 3x3 kernels. + * + * @param batch_size The number of images on the input and output of the convolutional layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input images. + * @param output_channels The number of channels (AKA features, dimensions) in the output images. + * @param input_size Size of input images, excluding implicit zero-padding. + * @param input_padding Implicit zero-padding of input images. + * @param kernel_size Kernel size. + * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width]. + * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param[in] bias A 1D array bias[output_channels]. + * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ + +enum nnp_status nnp_convolution_output( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float* input, + const float* kernel, + const float* bias, + float* output, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors. + * @details This function targets training of convolutional neural networks and performs backward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. 
+ * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). + * Supports only 3x3 kernels. + * + * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients). + * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients). + * @param input_size Size of input images and their gradients, excluding implicit zero-padding. + * @param input_padding Implicit zero-padding of input images. + * @param kernel_size Kernel size. + * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width] + * where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param[out] grad_input A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ +enum nnp_status nnp_convolution_input_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float* grad_output, + const float* kernel, + float* grad_input, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors. + * @details This function targets training of convolutional neural networks and performs backward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. + * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * + * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input images. + * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients). + * @param input_size Size of input images and their gradients, excluding implicit zero-padding. 
+ * @param input_padding Implicit zero-padding of input images. + * @param kernel_size Kernel size. + * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width]. + * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width] + * where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param[out] grad_kernel A 4D tensor + * grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ +enum nnp_status nnp_convolution_kernel_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float* input, + const float* grad_output, + float* grad_kernel, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor. + * @details This function targets prediction with convolutional neural networks and performs forward propagation. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. + * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). + * Supports only 3x3 kernels. + * + * @param transform_strategy A strategy that guides computation of kernel transforms coefficients. + * Possible values are: + * + * - nnp_convolution_transform_strategy_block_based -- do multiplication-accumulations on blocks of transformed + * coefficients. + * - nnp_convolution_transform_strategy_tuple_based -- do multiplication-accumulations on tuples of transformed + * coefficients. + * + * @param input_channels The number of channels (AKA features, dimensions) in the input image. + * @param output_channels The number of channels (AKA features, dimensions) in the output image. + * @param input_size Size of input image, excluding implicit zero-padding. + * @param input_padding Implicit zero-padding of input image. + * @param kernel_size Kernel size. + * @param output_subsampling Subsample region for output, also known as convolution stride. + * @param[in] input A 3D tensor input[input_channels][input_size.height][input_size.width]. + * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param[in] bias A 1D array bias[output_channels]. 
+ * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes. + * If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size + * of required workspace memory at the workspace_size location, and exit without + * computations. + * If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory + * before and deallocate after this computation, potentially at significant runtime cost. + * @param[in,out] workspace_size Pointer to the size of workspace buffer. + * If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to + * the location specified by this pointer. + * If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of + * the buffer, in bytes. + * If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK + * would allocate memory before and deallocate after this computation, potentially at + * significant runtime cost. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ +enum nnp_status nnp_convolution_inference( + enum nnp_convolution_algorithm algorithm, + enum nnp_convolution_transform_strategy transform_strategy, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + struct nnp_size output_subsampling, + const float* input, + const float* kernel, + const float* bias, + float* output, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes output of a fully connected layer from input and kernel matrices. + * @details This function targets training of convolutional neural networks and performs forward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * For minibatch size 1, use nnp_fully_connected_inference for optimal performance. + * @param batch_size The number of vectors on the input and output of the fully connected layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input matrix. + * @param output_channels The number of channels (AKA features, dimensions) in the output matrix. + * @param[in] input A 2D matrix input[batch_size][input_channels]. + * @param[in] kernel A 2D matrix kernel[output_channels][input_channels]. + * @param[out] output A 2D matrix output[batch_size][output_channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. 
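The workspace contract spelled out above is a two-call protocol: call once with workspace_buffer == NULL so NNPACK reports the required scratch size, then call again with an aligned buffer of that size. A sketch under assumed shapes (one input channel, eight output channels, 32x32 input, 3x3 kernel; the wrapper name is illustrative, and nnp_initialize() is presumed to have succeeded earlier):

#include <stdlib.h>
#include <nnpack.h>

enum nnp_status conv_with_workspace(const float* input, const float* kernel,
                                    const float* bias, float* output) {
    const struct nnp_size input_size = { 32, 32 };
    const struct nnp_padding padding = { 1, 1, 1, 1 };
    const struct nnp_size kernel_size = { 3, 3 };
    const struct nnp_size subsampling = { 1, 1 };

    /* First call: NULL buffer, so only the required size is written. */
    size_t workspace_size = 0;
    enum nnp_status status = nnp_convolution_inference(
        nnp_convolution_algorithm_auto,
        nnp_convolution_transform_strategy_compute,
        1, 8, input_size, padding, kernel_size, subsampling,
        input, kernel, bias, output, NULL, &workspace_size,
        nnp_activation_identity, NULL, NULL /* threadpool */, NULL /* profile */);
    if (status != nnp_status_success)
        return status;

    /* Second call: hand over a 64-byte-aligned buffer; the convolution runs.
     * aligned_alloc (C11) needs the size rounded up to the alignment. */
    void* workspace = aligned_alloc(64, (workspace_size + 63) & ~(size_t)63);
    if (workspace == NULL)
        return nnp_status_out_of_memory;
    status = nnp_convolution_inference(
        nnp_convolution_algorithm_auto,
        nnp_convolution_transform_strategy_compute,
        1, 8, input_size, padding, kernel_size, subsampling,
        input, kernel, bias, output, workspace, &workspace_size,
        nnp_activation_identity, NULL, NULL, NULL);
    free(workspace);
    return status;
}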
+ */ +enum nnp_status nnp_fully_connected_output( + size_t batch_size, + size_t input_channels, + size_t output_channels, + const float input[], + const float kernel[], + float output[], + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix. + * @details This function targets prediction with convolutional neural networks and performs forward propagation. + * @param input_channels The number of channels (AKA features, dimensions) in the input vector. + * @param output_channels The number of channels (AKA features, dimensions) in the output vector. + * @param[in] input A 1D array input[input_channels] of FP32 elements. + * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements. + * @param[out] output A 1D array output[output_channels] of FP32 elements. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_fully_connected_inference( + size_t input_channels, + size_t output_channels, + const float* input, + const float* kernel, + float* output, + pthreadpool_t threadpool); + +/** + * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix. + * @details This function targets prediction with convolutional neural networks and performs forward propagation. + * @param input_channels The number of channels (AKA features, dimensions) in the input vector. + * @param output_channels The number of channels (AKA features, dimensions) in the output vector. + * @param[in] input A 1D array input[input_channels] of FP32 elements. + * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements. + * @param[out] output A 1D array output[output_channels] of FP32 elements. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_fully_connected_inference_f16f32( + size_t input_channels, + size_t output_channels, + const float* input, + const void* kernel, + float* output, + pthreadpool_t threadpool); + +/** + * @brief Computes output of a max-pooling layer for an input tensor. + * @details This function targets both prediction and training of convolutional neural networks and performs forward + * propagation. It is optimized for both large and small minibatch sizes. + * @param batch_size The number of images on the input and output of the max-pooling layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output images. + * @param input_size Size of input images, excluding implicit zero-padding. + * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but + * affect the output size. + * @param pooling_size Size of the pooling filter. Only 2x2 filters are currently supported. + * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported. + * @param[in] input A 4D tensor input[batch_size][channels][input_size.height][input_size.width]. + * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where + * output_size.height = ceil( + * (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) / + * pooling_stride.height) + 1 + * output_size.width = ceil( + * (input_padding.left + input_size.width + input_padding.right - pooling_size.width) / + * pooling_stride.width) + 1 + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_max_pooling_output( + size_t batch_size, + size_t channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size pooling_size, + struct nnp_size pooling_stride, + const float input[], + float output[], + pthreadpool_t threadpool); + +/** + * @brief Computes output of a softmax layer for an input matrix. + * @details This function targets both prediction and training of convolutional neural networks and performs forward + * propagation. It is optimized for both large and small minibatch sizes. + * @param batch_size The number of vectors on the input and output of the softmax layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output vectors. + * @param[in] input A 2D matrix input[batch_size][channels]. + * @param[out] output A 2D matrix output[batch_size][channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_softmax_output( + size_t batch_size, + size_t channels, + const float input[], + float output[], + pthreadpool_t threadpool); + +/** + * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix. + * @details This function targets both prediction and training of convolutional neural networks and performs forward + * propagation. It is optimized for both large and small minibatch sizes. + * @param batch_size The number of vectors on the input and output of the ReLU layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output matrices. + * @param[in] input A 2D matrix input[batch_size][channels]. + * @param[out] output A 2D matrix output[batch_size][channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_relu_output( + size_t batch_size, + size_t channels, + const float input[], + float output[], + float negative_slope, + pthreadpool_t threadpool); + +/** + * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices. + * @details This function targets training of convolutional neural networks and performs backward propagation. + * It is optimized for both large and small minibatch sizes. + * @param batch_size The number of vectors on the input and output of the ReLU layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output matrices. + * @param[in] grad_output A 2D matrix grad_output[batch_size][channels]. + * @param[in] input A 2D matrix input[batch_size][channels]. + * @param[out] grad_input A 2D matrix grad_input[batch_size][channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization.
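The output-size expressions quoted in the convolution and pooling doc comments reduce to two one-line helpers; a sketch with illustrative names:

#include <stddef.h>

/* Stride-1 convolution output extent: padded input minus (kernel - 1). */
static size_t conv_output_dim(size_t pad_lo, size_t in, size_t pad_hi,
                              size_t kernel) {
    return (pad_lo + in + pad_hi) - (kernel - 1);
}

/* Pooling output extent: ceil((padded input - pooling size) / stride) + 1. */
static size_t pool_output_dim(size_t pad_lo, size_t in, size_t pad_hi,
                              size_t pool, size_t stride) {
    const size_t span = pad_lo + in + pad_hi - pool;
    return (span + stride - 1) / stride + 1; /* integer ceiling division */
}

/* Checks: a 3x3 kernel with 1-pixel padding keeps 224 at (1+224+1)-(3-1) = 224;
 * 2x2 pooling at stride 2 with no padding halves it: ceil((224-2)/2)+1 = 112. */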
+ */ +enum nnp_status nnp_relu_input_gradient( + size_t batch_size, + size_t channels, + const float grad_output[], + const float input[], + float grad_input[], + float negative_slope, + pthreadpool_t threadpool); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#ifdef __cplusplus +// Backward compatible implementations for nnp_convolution_*, if we are in C++ +// mode. +inline enum nnp_status nnp_convolution_output( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float input[], + const float kernel[], + const float bias[], + float output[], + pthreadpool_t threadpool, + struct nnp_profile* profile) +{ + return nnp_convolution_output( + algorithm, + batch_size, input_channels, output_channels, + input_size, input_padding, kernel_size, + input, kernel, bias, output, + NULL, NULL, + nnp_activation_identity, NULL, threadpool, profile); +} + +inline enum nnp_status nnp_convolution_input_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float grad_output[], + const float kernel[], + float grad_input[], + pthreadpool_t threadpool, + struct nnp_profile* profile) +{ + return nnp_convolution_input_gradient( + algorithm, + batch_size, input_channels, output_channels, + input_size, input_padding, kernel_size, + grad_output, kernel, grad_input, + NULL, NULL, + nnp_activation_identity, NULL, threadpool, profile); +} + +inline enum nnp_status nnp_convolution_kernel_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float input[], + const float grad_output[], + float grad_kernel[], + pthreadpool_t threadpool, + struct nnp_profile* profile) +{ + return nnp_convolution_kernel_gradient( + algorithm, + batch_size, input_channels, output_channels, + input_size, input_padding, kernel_size, + input, grad_output, grad_kernel, + NULL, NULL, + nnp_activation_identity, NULL, threadpool, profile); +} + +inline enum nnp_status nnp_convolution_inference( + enum nnp_convolution_algorithm algorithm, + enum nnp_convolution_transform_strategy transform_strategy, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + struct nnp_size output_subsampling, + const float input[], + const float kernel[], + const float bias[], + float output[], + pthreadpool_t threadpool, + struct nnp_profile* profile) { + return nnp_convolution_inference( + algorithm, transform_strategy, + input_channels, output_channels, + input_size, input_padding, kernel_size, output_subsampling, + input, kernel, bias, output, NULL, NULL, + nnp_activation_identity, NULL, + threadpool, profile); +} + +#endif // __cplusplus diff --git a/.venv/lib/python3.11/site-packages/torch/include/qnnpack_func.h b/.venv/lib/python3.11/site-packages/torch/include/qnnpack_func.h new file mode 100644 index 0000000000000000000000000000000000000000..10bbc000192d7e03745e2cf3fb263a9655cde00c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/qnnpack_func.h @@ -0,0 +1,166 @@ +#pragma once + +#include +#include + +namespace qnnpack { +class 
PrePackConvWeights final { + public: + PrePackConvWeights( + const pytorch_qnnp_operator_t convolution, + const uint8_t* kernel_zero_points, + const uint8_t* kernel, + const int32_t* bias); + + void* getPackedWeights() const + { + return packed_weights_; + } + + int64_t getOutputChannels() const + { + return output_channels_; + } + + ~PrePackConvWeights() + { + if (packed_weights_ != nullptr) { + free(packed_weights_); + } + } + + PrePackConvWeights() = delete; + PrePackConvWeights(const PrePackConvWeights&) = delete; + PrePackConvWeights& operator=(const PrePackConvWeights&) = delete; + + private: + void* packed_weights_ = nullptr; + int64_t output_channels_; +}; + +class PackBMatrix final { + public: + PackBMatrix( + size_t input_channels, + size_t output_channels, + const uint8_t* kernel_zero_points, + const float* requantization_scale, + const uint8_t* kernel, + const int32_t* bias); + + // This constructor is to be used for dynamic mode + // quantization. In dynamic mode, we dont yet support + // per channel quantization, and paying the cost of + // memory allocation for per channel zero point and + // requant scale will hurt performance. + PackBMatrix( + size_t input_channels, + size_t output_channels, + const uint8_t kernel_zero_point, + const float requantization_scale, + const uint8_t* kernel, + const int32_t* bias); + + void* getPackedWeights() const + { + return packed_weights_; + } + + void unpackWeights( + const uint8_t* kernel_zero_points, + int8_t* kernel + ) const; + + size_t getInputChannels() const + { + return input_channels_; + } + + size_t getOutputChannels() const + { + return output_channels_; + } + + ~PackBMatrix() + { + if (packed_weights_ != nullptr) { + free(packed_weights_); + } + } + + PackBMatrix() = delete; + PackBMatrix(const PackBMatrix&) = delete; + PackBMatrix& operator=(const PackBMatrix&) = delete; + + private: + void* packed_weights_ = nullptr; + size_t input_channels_; + size_t output_channels_; +}; + +enum pytorch_qnnp_status qnnpackLinear( + const size_t batch_size, + const size_t input_channels, + const size_t output_channels, + const uint8_t input_zero_point, + const uint8_t* kernel_zero_points, + const float* requantization_scales, + const uint8_t output_zero_point, + const uint8_t output_min, + const uint8_t output_max, + const uint8_t* input, + const size_t input_stride, + void* packed_weights, + uint8_t* output, + const size_t output_stride, + pthreadpool_t threadpool); + +enum pytorch_qnnp_status qnnpackConv( + const pytorch_qnnp_operator_t convolution, + void* packed_weights, + const size_t batch_size, + const size_t input_depth, + const size_t input_height, + const size_t input_width, + const uint8_t input_zero_point, + const uint8_t* input, + const uint8_t* kernel_zero_points, + const float* requantization_scales, + const uint8_t output_zero_point, + const uint8_t output_min, + const uint8_t output_max, + uint8_t* output, + pthreadpool_t threadpool); + +enum pytorch_qnnp_status qnnpackDeConv( + const pytorch_qnnp_operator_t deconvolution, + void* packed_weights, + const size_t batch_size, + const size_t input_height, + const size_t input_width, + const uint8_t input_zero_point, + const uint8_t* input, + const uint8_t* kernel_zero_points, + const float* requantization_scales, + const uint8_t output_zero_point, + const uint8_t output_min, + const uint8_t output_max, + uint8_t* output, + pthreadpool_t threadpool); + +enum pytorch_qnnp_status qnnpackLinearDynamic( + const size_t batch_size, + const size_t input_channels, + const size_t 
output_channels, + const uint8_t input_zero_point, + const uint8_t* kernel_zero_points, + const float* dequantization_scales, + const uint8_t* input, + const size_t input_stride, + void* packed_weights, + const float* bias, + float* output, + const size_t output_stride, + pthreadpool_t threadpool); + +} // namespace qnnpack diff --git a/.venv/lib/python3.11/site-packages/torch/include/sleef.h b/.venv/lib/python3.11/site-packages/torch/include/sleef.h new file mode 100644 index 0000000000000000000000000000000000000000..292ac5b8be30c5766679ce2dd562014fdf50d4f2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/sleef.h @@ -0,0 +1,4170 @@ +// Copyright Naoki Shibata and contributors 2010 - 2023. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef __SLEEF_H__ +#define __SLEEF_H__ + +#define SLEEF_VERSION_MAJOR 3 +#define SLEEF_VERSION_MINOR 6 +#define SLEEF_VERSION_PATCHLEVEL 0 + +#include +#include + +#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) +#define SLEEF_CONST __attribute__((const)) +#define SLEEF_INLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define SLEEF_CONST +#define SLEEF_INLINE __forceinline +#endif + +#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) +#ifndef FP_FAST_FMA +#define FP_FAST_FMA +#endif +#ifndef FP_FAST_FMAF +#define FP_FAST_FMAF +#endif +#endif + +#if defined(_MSC_VER) && !defined(__STDC__) +#define __STDC__ 1 +#endif + +#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#ifdef SLEEF_IMPORT_IS_EXPORT +#define SLEEF_IMPORT __declspec(dllexport) +#else // #ifdef SLEEF_IMPORT_IS_EXPORT +#define SLEEF_IMPORT __declspec(dllimport) +#if (defined(_MSC_VER)) +#pragma comment(lib,"sleef.lib") +#endif // #if (defined(_MSC_VER)) +#endif // #ifdef SLEEF_IMPORT_IS_EXPORT +#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#define SLEEF_IMPORT +#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) + +#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif + +#if (defined(_MSC_VER)) +#include +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#endif + +#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#include +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +#if defined(__VX__) && defined(__VEC__) +#ifndef SLEEF_VECINTRIN_H_INCLUDED +#include +#define SLEEF_VECINTRIN_H_INCLUDED +#endif +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +// + +#if defined(SLEEF_ENABLE_OMP_SIMD) && (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#if 
defined(__aarch64__) +//#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(2) notinbranch") +//#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(4) notinbranch") +//#elif defined(__x86_64__) && defined(__AVX512F__) +//#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(8) notinbranch") +//#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(16) notinbranch") +#elif defined(__x86_64__) && defined(__AVX__) +#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(4) notinbranch") +#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(8) notinbranch") +#elif defined(__x86_64__) && defined(__SSE2__) +#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(2) notinbranch") +#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(4) notinbranch") +#endif +#endif + +#ifndef SLEEF_PRAGMA_OMP_SIMD_DP +#define SLEEF_PRAGMA_OMP_SIMD_DP +#define SLEEF_PRAGMA_OMP_SIMD_SP +#endif + +// + +#ifndef SLEEF_FP_ILOGB0 +#define SLEEF_FP_ILOGB0 ((int)0x80000000) +#endif + +#ifndef SLEEF_FP_ILOGBNAN +#define SLEEF_FP_ILOGBNAN ((int)2147483647) +#endif + +// + +SLEEF_IMPORT void *Sleef_malloc(size_t z); +SLEEF_IMPORT void Sleef_free(void *ptr); +SLEEF_IMPORT uint64_t Sleef_currentTimeMicros(); + +#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +SLEEF_IMPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx); +#endif + +// + +#if defined(__riscv_v) +#include <riscv_vector.h> +typedef vfloat64m2_t Sleef_vfloat64m1_t_2; +typedef vfloat32m2_t Sleef_vfloat32m1_t_2; +typedef vfloat64m4_t Sleef_vfloat64m2_t_2; +typedef vfloat32m4_t Sleef_vfloat32m2_t_2; +#define Sleef_vfloat64m1_t_2_DEFINED +#define Sleef_vfloat32m1_t_2_DEFINED +#define Sleef_vfloat64m2_t_2_DEFINED +#define Sleef_vfloat32m2_t_2_DEFINED +#endif + +#ifndef Sleef_double2_DEFINED +#define Sleef_double2_DEFINED +typedef struct { + double x, y; +} Sleef_double2; +#endif + +#ifndef Sleef_float2_DEFINED +#define Sleef_float2_DEFINED +typedef struct { + float x, y; +} Sleef_float2; +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sin_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cos_u35(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincos_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tan_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asin_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acos_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan2_u35(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrt_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sin_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cos_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincos_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tan_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asin_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acos_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double
Sleef_atan2_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrt_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_pow_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tanh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinh_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosh_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tanh_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asinh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acosh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atanh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_expm1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log10_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log1p_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincospi_u05(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincospi_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinpi_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cospi_u05(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexp(double, int); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST int Sleef_ilogb(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fma(double, double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrt(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrt_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrt_u35(double); + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypot_u05(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypot_u35(double, double); + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fabs(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_copysign(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmax(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmin(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fdim(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_trunc(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_floor(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_ceil(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_round(double); 
+SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_rint(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_nextafter(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexp(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST int Sleef_expfrexp(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmod(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_remainder(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_modf(double); + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_lgamma_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tgamma_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erf_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erfc_u15(double); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf_u35(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincosf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f_u35(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincosf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_powf_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf_u3500(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float 
Sleef_acoshf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log10f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincospif_u05(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincospif_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif_u05(float d); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cospif_u05(float d); +SLEEF_IMPORT SLEEF_CONST float Sleef_ldexpf(float, int); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST int Sleef_ilogbf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf(float, float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf_u35(float); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf_u05(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf_u35(float, float); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fminf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_truncf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_floorf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_roundf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_rintf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST int Sleef_expfrexpf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_modff(float); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erff_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf_u15(float); +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + 
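The scalar declarations above make SLEEF's naming convention visible: the trailing _uNN suffix encodes the maximum error bound in units of 0.1 ULP (so _u10 is a 1.0 ULP bound, _u35 is 3.5 ULP, and _u05 is roughly 0.5 ULP), and the sincos variants return both results at once through the two-element Sleef_double2/Sleef_float2 structs defined earlier. The SIMD blocks that follow repeat the same surface per vector type, with an element-count infix in the name (sind2 operates on two doubles in an __m128d, sinf4 on four floats in an __m128). A minimal scalar sketch, assuming libsleef is installed and the program is linked with -lsleef:

#include <cstdio>
#include <sleef.h>

int main() {
  double x = 1.0;
  // Same function, two accuracy tiers: the error bound is part of the name.
  std::printf("sin u10: %.17g\n", Sleef_sin_u10(x));  // <= 1.0 ULP
  std::printf("sin u35: %.17g\n", Sleef_sin_u35(x));  // <= 3.5 ULP

  // sincos computes both values in one call: sine in .x, cosine in .y.
  Sleef_double2 sc = Sleef_sincos_u10(x);
  std::printf("sincos: %.17g %.17g\n", sc.x, sc.y);

  // Single-precision variants carry an f before the accuracy suffix.
  std::printf("powf u10: %g\n", Sleef_powf_u10(2.0f, 10.0f));
  return 0;
}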
+SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2(__m128d); +SLEEF_IMPORT 
SLEEF_CONST __m128d Sleef_copysignd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_truncd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10(__m128); 
+SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35sse2(__m128d, 
__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u35sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cbrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cbrtd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_powd2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastsind2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastcosd2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastpowd2_u3500sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_cinz_acoshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expm1d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log1pd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinpid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2_sse2(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ldexpd2_sse2(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_ilogbd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2_sse2(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmad2_sse2(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u05sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u35sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fabsd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_copysignd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_copysignd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_cinz_fmaxd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmind2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fdimd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_truncd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_truncd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_floord2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ceild2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_roundd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_rintd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_nextafterd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_frfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_expfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmodd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_remainderd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_lgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfcd2_u15sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2_sse2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2_sse2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 
Sleef_cinz_atanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_powf4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastsinf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastcosf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastpowf4_u3500sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acoshf4_u10sse2(__m128); 
+SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expm1f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log1pf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinpif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4_sse2(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaf4_sse2(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u05sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fabsf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_copysignf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaxf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fminf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fdimf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 
Sleef_cinz_truncf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_floorf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_ceilf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_roundf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_rintf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_nextafterf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_frfrexpf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmodf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_remainderf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_modff4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_lgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erff4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erfcf4_u15sse2(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4_sse2(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf4_sse2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4_sse2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf4_sse2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_cinz_cbrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cbrtd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_powd2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastsind2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastcosd2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastpowd2_u3500sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acoshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_exp2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expm1d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log1pd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinpid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2_sse4(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ldexpd2_sse4(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_ilogbd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2_sse4(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmad2_sse4(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u05sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fabsd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_copysignd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_copysignd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmaxd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmind2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fdimd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST 
__m128d Sleef_truncd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_truncd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_floord2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ceild2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_roundd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_rintd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_nextafterd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_frfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_expfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmodd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_remainderd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_lgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfcd2_u15sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2_sse4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2_sse4(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 
Sleef_cbrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_powf4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastsinf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastcosf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastpowf4_u3500sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acoshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35sse4(__m128); +SLEEF_IMPORT 
SLEEF_CONST __m128 Sleef_cinz_exp2f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expm1f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log1pf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinpif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4_sse4(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaf4_sse4(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u05sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fabsf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_copysignf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaxf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fminf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fdimf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_truncf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_floorf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_ceilf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4_sse4(__m128); +SLEEF_IMPORT 
SLEEF_CONST __m128 Sleef_cinz_roundf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_rintf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_nextafterf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_frfrexpf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmodf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_remainderf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_modff4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_lgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erff4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erfcf4_u15sse4(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4_sse4(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf4_sse4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4_sse4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf4_sse4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_tanhd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_lgammad4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15(__m256d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35(__m256); 
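/* [Editorial note: illustrative sketch, not part of the vendored sleef.h diff.]
 * A minimal usage example for the unsuffixed AVX entry points declared in this
 * section. Assumptions: libsleef is installed and linked (e.g. `cc -mavx demo.c
 * -lsleef`). The `_uNN` suffix encodes the maximum error bound in ULP times ten
 * (u05 = 0.5 ULP, u10 = 1.0 ULP, u15 = 1.5 ULP, u35 = 3.5 ULP), and the
 * two-member structs hold paired results (for sincos: sine in .x, cosine in .y). */
#include <immintrin.h>
#include <stdio.h>
#include <sleef.h>

int main(void) {
    __m256d x = _mm256_set_pd(0.1, 0.2, 0.3, 0.4); /* lane 0 holds 0.4 */
    __m256d s = Sleef_sind4_u10(x);                /* vector sine, 1.0-ULP bound */
    Sleef___m256d_2 sc = Sleef_sincosd4_u10(x);    /* .x = sin(x), .y = cos(x) */
    double out[4];
    _mm256_storeu_pd(out, s);
    printf("sin(0.4) = %.17g\n", out[0]);
    _mm256_storeu_pd(out, sc.y);
    printf("cos(0.4) = %.17g\n", out[0]);
    return 0;
}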
+SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_powf8_u10(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fabsf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8(__m256); +SLEEF_IMPORT 
SLEEF_CONST __m256 Sleef_nextafterf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_asind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_acosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atan2d4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_logd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cbrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_asind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_acosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atan2d4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_logd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cbrtd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST 
__m256d Sleef_expd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_expd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_powd4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_coshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sinhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_coshd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tanhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fastsind4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fastcosd4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fastpowd4_u3500avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_asinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_acoshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp10d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_expm1d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log1pd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_cinz_sinpid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4_avx(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_ldexpd4_avx(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_ilogbd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4_avx(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmad4_avx(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sqrtd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sqrtd4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sqrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_hypotd4_u05avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_hypotd4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fabsd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_copysignd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmaxd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmind4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fdimd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_truncd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_floord4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_ceild4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_roundd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_rintd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_nextafterd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_frfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_expfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmodd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_remainderd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_modfd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST 
__m256d Sleef_lgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_lgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_erfd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_erfcd4_u15avx(__m256d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd4_avx(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4_avx(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_asinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_acosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atan2f8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_logf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cbrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_asinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_acosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atan2f8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_logf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cbrtf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_expf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_powf8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_powf8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_coshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_coshf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fastsinf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fastcosf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fastpowf8_u3500avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_asinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_acoshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp10f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_expm1f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log1pf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincospif8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinpif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8_avx(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST 
__m256 Sleef_cinz_fmaf8_avx(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sqrtf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sqrtf8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sqrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_hypotf8_u05avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_hypotf8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fabsf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fabsf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_copysignf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fmaxf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fminf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fdimf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_truncf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_floorf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_ceilf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_roundf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_rintf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_nextafterf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_nextafterf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_frfrexpf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fmodf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_remainderf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_modff8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_lgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_erff8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_erfcf8_u15avx(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8_avx(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf8_avx(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8_avx(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf8_avx(int); +#endif +#ifdef __AVX__ + +#ifndef 
Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_powd4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_finz_tanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tanhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastsind4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastcosd4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastpowd4_u3500fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acoshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expm1d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log1pd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinpid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4_fma4(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ldexpd4_fma4(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_ilogbd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4_fma4(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmad4_fma4(__m256d, __m256d, 
__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u05fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fabsd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_copysignd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmaxd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmind4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fdimd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_truncd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_floord4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ceild4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_roundd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_rintd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_nextafterd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_frfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_expfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmodd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_remainderd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_modfd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_lgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_lgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfcd4_u15fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST int 
Sleef_getIntd4_fma4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4_fma4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_powf8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_powf8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_finz_tanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastsinf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastcosf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastpowf8_u3500fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acoshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expm1f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log1pf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinpif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8_fma4(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaf8_fma4(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_finz_sqrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u05fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fabsf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fabsf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_copysignf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaxf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fminf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fdimf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_truncf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_floorf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_ceilf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_roundf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_rintf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_nextafterf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_nextafterf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_frfrexpf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmodf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_remainderf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_modff8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_lgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erff8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erfcf8_u15fma4(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8_fma4(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf8_fma4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8_fma4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf8_fma4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_finz_cosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_powd4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_tanhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tanhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastsind4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastcosd4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastpowd4_u3500avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acoshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expm1d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log1pd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinpid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4_avx2(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ldexpd4_avx2(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_ilogbd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4_avx2(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmad4_avx2(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35avx2(__m256d); 
+SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u05avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fabsd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_copysignd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmaxd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmind4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fdimd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_truncd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_floord4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ceild4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_roundd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_rintd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_nextafterd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_frfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_expfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmodd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_remainderd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_modfd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_lgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_lgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfcd4_u15avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd4_avx2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4_avx2(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u35avx2(__m256); 
+SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_powf8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_powf8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35avx2(__m256); 
+SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanhf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastsinf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastcosf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastpowf8_u3500avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acoshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expm1f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log1pf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinpif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8_avx2(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaf8_avx2(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u05avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_fabsf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fabsf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_copysignf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaxf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fminf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fdimf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_truncf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_floorf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_ceilf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_roundf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_rintf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_nextafterf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_nextafterf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_frfrexpf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmodf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_remainderf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_modff8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_lgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erff8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erfcf8_u15avx2(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8_avx2(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf8_avx2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8_avx2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf8_avx2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_asind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_asind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_acosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atan2d2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_logd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cbrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_asind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_acosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atan2d2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_logd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cbrtd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_expd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_powd2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_coshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sinhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_coshd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tanhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_finz_fastsind2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fastcosd2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fastpowd2_u3500avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_asinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_acoshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp10d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_expm1d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log1pd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincospid2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sinpid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2_avx2128(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_ldexpd2_avx2128(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_ilogbd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2_avx2128(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmad2_avx2128(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sqrtd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sqrtd2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_finz_sqrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_hypotd2_u05avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_hypotd2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fabsd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_copysignd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_copysignd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmaxd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmind2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fdimd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_truncd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_truncd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_floord2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_ceild2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_roundd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_rintd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_nextafterd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_frfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_expfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmodd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_remainderd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_modfd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_lgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_erfd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_erfcd2_u15avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2_avx2128(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2_avx2128(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST 
__m128 Sleef_sinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_asinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_acosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atan2f4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_logf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cbrtf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_asinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_acosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atan2f4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_logf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cbrtf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_expf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_powf4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_coshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35avx2128(__m128); +SLEEF_IMPORT 
SLEEF_CONST __m128 Sleef_finz_sinhf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_coshf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanhf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fastsinf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fastcosf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fastpowf4_u3500avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_asinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_acoshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp10f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_expm1f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log1pf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincospif4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinpif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4_avx2128(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fmaf4_avx2128(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sqrtf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sqrtf4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35avx2128(__m128); 
+SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sqrtf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_hypotf4_u05avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_hypotf4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fabsf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_copysignf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fmaxf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fminf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fdimf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_truncf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_floorf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_ceilf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_roundf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_rintf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_nextafterf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_frfrexpf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fmodf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_remainderf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_modff4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_lgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_erff4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_erfcf4_u15avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4_avx2128(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf4_avx2128(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4_avx2128(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf4_avx2128(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u35(__m512d); +SLEEF_IMPORT 
SLEEF_CONST __m512d Sleef_cosd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u35(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u10(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_powd8_u10(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastsind8_u3500(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastcosd8_u3500(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastpowd8_u3500(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asinhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acoshd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atanhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expm1d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log10d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log1pd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinpid8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cospid8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ldexpd8(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_ilogbd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmad8(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u05(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u35(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fabsd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_copysignd8(__m512d, __m512d); +SLEEF_IMPORT 
SLEEF_CONST __m512d Sleef_fmaxd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmind8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fdimd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_truncd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_floord8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ceild8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_roundd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_rintd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_nextafterd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_frfrexpd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_expfrexpd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmodd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_remainderd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_modfd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_lgammad8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tgammad8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfcd8_u15(__m512d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd8(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd8(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u35(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u10(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_powf16_u10(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastsinf16_u3500(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastcosf16_u3500(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastpowf16_u3500(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acoshf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_exp2f16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expm1f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log10f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log1pf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinpif16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cospif16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaf16(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u05(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u35(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fabsf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_copysignf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaxf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fminf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fdimf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_truncf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_floorf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_ceilf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_roundf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_rintf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_nextafterf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_frfrexpf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmodf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_remainderf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_modff16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_lgammaf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tgammaf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erff16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erfcf16_u15(__m512); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf16(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf16(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tand8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_asind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_acosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atand8_u35avx512f(__m512d); +SLEEF_IMPORT 
SLEEF_CONST __m512d Sleef_atan2d8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atan2d8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_logd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cbrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_asind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_acosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atan2d8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_logd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cbrtd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_expd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_powd8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_powd8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sinhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_coshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sinhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_coshd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tanhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastsind8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fastsind8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastcosd8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fastcosd8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastpowd8_u3500avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fastpowd8_u3500avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asinhd8_u10avx512f(__m512d); 
+SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_asinhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acoshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_acoshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp10d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expm1d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_expm1d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log1pd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log1pd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincospid8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinpid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sinpid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ldexpd8_avx512f(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_ldexpd8_avx512f(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_ilogbd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_finz_ilogbd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmad8_avx512f(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmad8_avx512f(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sqrtd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sqrtd8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sqrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u05avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_hypotd8_u05avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_hypotd8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fabsd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d 
Sleef_finz_fabsd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_copysignd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_copysignd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmaxd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmaxd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmind8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmind8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fdimd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fdimd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_truncd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_truncd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_floord8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_floord8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ceild8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_ceild8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_roundd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_roundd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_rintd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_rintd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_nextafterd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_nextafterd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_frfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_frfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_expfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_finz_expfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmodd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmodd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_remainderd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_remainderd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_modfd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_modfd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_lgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_lgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_erfd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfcd8_u15avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_erfcd8_u15avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd8_avx512f(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd8_avx512f(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_tanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_asinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_acosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atan2f16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_logf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cbrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_asinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_acosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atan2f16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_logf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cbrtf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_expf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_powf16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_powf16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_coshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_coshf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_fastsinf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fastsinf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastcosf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fastcosf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastpowf16_u3500avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fastpowf16_u3500avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_asinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acoshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_acoshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp10f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expm1f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_expm1f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log1pf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log1pf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincospif16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinpif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinpif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaf16_avx512f(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fmaf16_avx512f(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sqrtf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sqrtf16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sqrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u05avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_hypotf16_u05avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST 
__m512 Sleef_finz_hypotf16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fabsf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fabsf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_copysignf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_copysignf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaxf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fmaxf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fminf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fminf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fdimf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fdimf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_truncf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_truncf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_floorf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_floorf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_ceilf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_ceilf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_roundf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_roundf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_rintf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_rintf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_nextafterf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_nextafterf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_frfrexpf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_frfrexpf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmodf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fmodf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_remainderf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_remainderf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_modff16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_modff16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_lgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_lgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erff16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_erff16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erfcf16_u15avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_erfcf16_u15avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf16_avx512f(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf16_avx512f(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf16_avx512f(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf16_avx512f(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 
Sleef_sincosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_asind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_acosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atan2d8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_logd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cbrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_asind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_acosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atan2d8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_logd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cbrtd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_expd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_powd8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_powd8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_coshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d 
Sleef_sinhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sinhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_coshd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tanhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastsind8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fastsind8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastcosd8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fastcosd8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_asinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acoshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_acoshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp10d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expm1d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_expm1d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log1pd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log1pd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinpid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sinpid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ldexpd8_avx512fnofma(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m512d 
Sleef_cinz_ldexpd8_avx512fnofma(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_ilogbd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_cinz_ilogbd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sqrtd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sqrtd8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sqrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u05avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_hypotd8_u05avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_hypotd8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fabsd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fabsd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_copysignd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_copysignd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmaxd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmaxd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmind8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmind8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fdimd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fdimd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_truncd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_truncd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_floord8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_floord8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ceild8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_ceild8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_roundd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_roundd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_rintd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_rintd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_nextafterd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_nextafterd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_frfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_frfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_expfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_cinz_expfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmodd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmodd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_remainderd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_remainderd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST 
Sleef___m512d_2 Sleef_modfd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_modfd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_lgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_lgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_erfd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfcd8_u15avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_erfcd8_u15avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd8_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd8_avx512fnofma(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_asinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_acosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atan2f16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_logf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cbrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_asinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_acosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_cinz_atanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atan2f16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_logf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cbrtf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_expf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_powf16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_powf16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_coshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_coshf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastsinf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fastsinf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastcosf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fastcosf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastpowf16_u3500avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fastpowf16_u3500avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_asinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acoshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_acoshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp10f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expm1f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_expm1f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_cinz_log10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_log2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_log2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log1pf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_log1pf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincospif16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinpif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinpif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaf16_avx512fnofma(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fmaf16_avx512fnofma(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sqrtf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sqrtf16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sqrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u05avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_hypotf16_u05avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_hypotf16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fabsf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fabsf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_copysignf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_copysignf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaxf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fmaxf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fminf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fminf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fdimf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fdimf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_truncf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_truncf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_floorf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_floorf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_ceilf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_ceilf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_roundf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_roundf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_rintf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST 
__m512 Sleef_cinz_rintf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_nextafterf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_nextafterf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_frfrexpf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_frfrexpf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmodf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fmodf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_remainderf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_remainderf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_modff16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_modff16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_lgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_lgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erff16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_erff16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erfcf16_u15avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_erfcf16_u15avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf16_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf16_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf16_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf16_avx512fnofma(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef Sleef_double2 Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_asind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_acosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atan2d1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_logd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cbrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 
Sleef_sincosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_asind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_acosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atan2d1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_logd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cbrtd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_expd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_expd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_powd1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_powd1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_coshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sinhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_coshd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tanhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastsind1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fastsind1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastcosd1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fastcosd1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastpowd1_u3500purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fastpowd1_u3500purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_asinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acoshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_acoshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp10d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double 
Sleef_expm1d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_expm1d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log1pd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log1pd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincospid1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinpid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sinpid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexpd1_purec(double, int32_t); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_ldexpd1_purec(double, int32_t); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_ilogbd1_purec(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_cinz_ilogbd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmad1_purec(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmad1_purec(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sqrtd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sqrtd1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sqrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u05purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_hypotd1_u05purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_hypotd1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fabsd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fabsd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_copysignd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_copysignd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmaxd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmaxd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmind1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmind1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fdimd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fdimd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_truncd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_truncd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_floord1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_floord1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ceild1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_ceild1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_roundd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double 
Sleef_cinz_roundd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_rintd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_rintd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_nextafterd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_nextafterd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_frfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_expfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_cinz_expfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmodd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmodd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_remainderd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_remainderd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_modfd1_purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_modfd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_lgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_lgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_erfd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfcd1_u15purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_erfcd1_u15purec(double); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd1_purec(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd1_purec(int); + +#ifndef Sleef_float_2_DEFINED +typedef Sleef_float2 Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_asinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_acosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atan2f1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_logf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cbrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float 
Sleef_tanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_asinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_acosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atan2f1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_logf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cbrtf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_expf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_powf1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_powf1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_coshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_coshf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fastsinf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fastcosf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf1_u3500purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fastpowf1_u3500purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_asinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acoshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_acoshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp10f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp10f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_expm1f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log10f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log10f1_u10purec(float); +SLEEF_IMPORT 
SLEEF_CONST float Sleef_log2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log1pf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincospif1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinpif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf1_purec(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fmaf1_purec(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sqrtf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sqrtf1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sqrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u05purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_hypotf1_u05purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_hypotf1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fabsf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_copysignf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fmaxf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fminf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fminf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fdimf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_truncf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_truncf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_floorf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_floorf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_ceilf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_roundf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_roundf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_rintf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_rintf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_nextafterf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_frfrexpf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fmodf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf1_purec(float, float); 
+SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_remainderf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_modff1_purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_modff1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_lgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erff1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_erff1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf1_u15purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_erfcf1_u15purec(float); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf1_purec(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf1_purec(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf1_purec(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf1_purec(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef Sleef_double2 Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_asind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_acosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atan2d1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_logd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cbrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_asind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_acosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST 
double Sleef_atan2d1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atan2d1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_logd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cbrtd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_expd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_expd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_powd1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_powd1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_coshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sinhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_coshd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tanhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastsind1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fastsind1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastcosd1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fastcosd1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastpowd1_u3500purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fastpowd1_u3500purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_asinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acoshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_acoshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp2d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp10d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_expm1d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_expm1d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log2d1_u35purecfma(double); +SLEEF_IMPORT 
SLEEF_CONST double Sleef_log1pd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log1pd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincospid1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinpid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sinpid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexpd1_purecfma(double, int32_t); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_ldexpd1_purecfma(double, int32_t); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_ilogbd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_finz_ilogbd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmad1_purecfma(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmad1_purecfma(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sqrtd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sqrtd1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sqrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u05purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_hypotd1_u05purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_hypotd1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fabsd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fabsd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_copysignd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_copysignd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmaxd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmaxd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmind1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmind1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fdimd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fdimd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_truncd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_truncd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_floord1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_floord1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ceild1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_ceild1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_roundd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_roundd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_rintd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_rintd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_nextafterd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_nextafterd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexpd1_purecfma(double); 
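+// ---------------------------------------------------------------------------
+// [Editor's note, illustrative sketch, not part of the upstream header] The
+// declarations above follow SLEEF's naming scheme: "d1"/"f1" are single-lane
+// double/float entry points, the "uNN" suffix is the worst-case error bound
+// in tenths of an ULP (u05 = 0.5 ULP, u10 = 1.0 ULP, u35 = 3.5 ULP), and
+// "purec"/"purecfma" are the pure-C kernels built without/with FMA; the
+// "cinz_"/"finz_" names are the matching deterministic aliases. A minimal
+// caller, assuming <sleef.h> is on the include path and libsleef is linked:
+//
+//   #include <sleef.h>
+//   #include <stdio.h>
+//
+//   int main(void) {
+//     double x = 0.5;
+//     double hi = Sleef_sind1_u10purecfma(x);  // sine, 1.0-ULP bound
+//     double lo = Sleef_sind1_u35purecfma(x);  // faster, 3.5-ULP bound
+//     printf("%.17g %.17g\n", hi, lo);
+//     return 0;
+//   }
+// ---------------------------------------------------------------------------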
+SLEEF_IMPORT SLEEF_CONST double Sleef_finz_frfrexpd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_expfrexpd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_finz_expfrexpd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmodd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmodd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_remainderd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_remainderd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_modfd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_modfd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_lgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_lgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_erfd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfcd1_u15purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_erfcd1_u15purecfma(double); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd1_purecfma(int); + +#ifndef Sleef_float_2_DEFINED +typedef Sleef_float2 Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_asinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_acosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atan2f1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_logf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cbrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_asinf1_u10purecfma(float); +SLEEF_IMPORT 
SLEEF_CONST float Sleef_acosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_acosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atan2f1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_logf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cbrtf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_expf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_powf1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_powf1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_coshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_coshf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fastsinf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fastcosf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf1_u3500purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fastpowf1_u3500purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_asinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acoshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_acoshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp10f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_expm1f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_log10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float 
Sleef_finz_log2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_log2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_log1pf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincospif1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinpif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf1_purecfma(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fmaf1_purecfma(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sqrtf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sqrtf1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sqrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u05purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_hypotf1_u05purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_hypotf1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fabsf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_copysignf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fmaxf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fminf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fminf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fdimf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_truncf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_truncf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_floorf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_floorf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_ceilf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_roundf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_roundf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_rintf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_rintf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_nextafterf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_frfrexpf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fmodf1_purecfma(float, float); 
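+// ---------------------------------------------------------------------------
+// [Editor's note, illustrative] Functions declared with a Sleef_double_2 /
+// Sleef_float_2 return type pack two results into one struct; for the sincos
+// and sincospi families the sine is returned in the .x member and the cosine
+// in .y. A minimal sketch, assuming the header is included:
+//
+//   Sleef_float_2 sc = Sleef_sincosf1_u10purecfma(0.5f);
+//   float s = sc.x;  // sin(0.5f), 1.0-ULP bound
+//   float c = sc.y;  // cos(0.5f), 1.0-ULP bound
+// ---------------------------------------------------------------------------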
+SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_remainderf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_modff1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_modff1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_lgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erff1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_erff1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf1_u15purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_erfcf1_u15purecfma(float); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf1_purecfma(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef Sleef_double2 Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u35(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u35(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_expd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_powd1_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double 
Sleef_tanhd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fastsind1_u3500(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fastcosd1_u3500(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fastpowd1_u3500(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asinhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acoshd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atanhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_expm1d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log10d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log1pd1_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u05(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinpid1_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cospid1_u05(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexpd1(double, int32_t); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_ilogbd1(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmad1(double, double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u05(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u35(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fabsd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_copysignd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmaxd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmind1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fdimd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_truncd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_floord1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_ceild1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_roundd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_rintd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_nextafterd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexpd1(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_expfrexpd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmodd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_remainderd1(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 
Sleef_modfd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_lgammad1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tgammad1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erfd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erfcd1_u15(double); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd1(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd1(int); + +#ifndef Sleef_float_2_DEFINED +typedef Sleef_float2 Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u35(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u35(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_powf1_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf1_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf1_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf1_u3500(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acoshf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u35(float); 
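+// ---------------------------------------------------------------------------
+// [Editor's note, illustrative] The SLEEF_PRAGMA_OMP_SIMD_DP/_SP macros in
+// front of these dispatch entry points expand, when OpenMP SIMD support is
+// enabled, to "omp declare simd" annotations, so an OpenMP-aware compiler
+// may vectorize loops over the scalar functions. A minimal sketch, assuming
+// compilation with OpenMP SIMD (e.g. -fopenmp-simd) and libsleef linked:
+//
+//   #include <sleef.h>
+//
+//   void sin_all(const double *in, double *out, int n) {
+//     #pragma omp simd
+//     for (int i = 0; i < n; i++)
+//       out[i] = Sleef_sind1_u10(in[i]);  // 1.0-ULP sine per element
+//   }
+// ---------------------------------------------------------------------------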
+SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log10f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf1_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u05(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif1_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cospif1_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf1(float, float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u05(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u35(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fminf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_truncf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_floorf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_roundf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_rintf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf1(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_modff1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erff1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf1_u15(float); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf1(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf1(int); +#endif + +// + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // #ifndef __SLEEF_H__ diff --git a/.venv/lib/python3.11/site-packages/torch/include/xnnpack.h b/.venv/lib/python3.11/site-packages/torch/include/xnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..e71be0fd57ffc1ef2cc67b2fc8fb20fc4288a1d2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/xnnpack.h @@ -0,0 +1,6172 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <pthreadpool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/// The number of bytes XNNPACK may read beyond array bounds. +/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK. +/// +/// Note: XNNPACK reads, but never writes beyond array bounds. +#define XNN_EXTRA_BYTES 16 + +/// Maximum number of dimensions in tensor shape. +#define XNN_MAX_TENSOR_DIMS 6 + +/// Allow sparse inference in a Runtime. +/// +/// Note: this flag hints XNNPACK to consider sparse inference, but does not guarantee it. +#define XNN_FLAG_HINT_SPARSE_INFERENCE 0x00000001 + +/// Allow IEEE FP16 inference in a Runtime. +/// +/// Note: this flag hints XNNPACK to consider IEEE FP16 inference, but does not guarantee it. +#define XNN_FLAG_HINT_FP16_INFERENCE 0x00000002 + +/// Force IEEE FP16 inference in a Runtime, and fail if FP16 inference is not possible. +/// +/// Note: this flag guarantees that XNNPACK will use IEEE FP16 inference, or fail to create the Runtime object. +/// Warning: on x86 systems FP16 computations will be emulated at a substantial performance cost. +#define XNN_FLAG_FORCE_FP16_INFERENCE 0x00000004 + +/// Enable timing of each operator's runtime. +#define XNN_FLAG_BASIC_PROFILING 0x00000008 + +/// Enable the just-in-time compiler. +#define XNN_FLAG_JIT 0x00000010 + +/// The convolution operator represents a depthwise convolution, and uses HWGo layout for filters. +#define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001 + +/// Assume transposed weights in a fully connected operator. +#define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001 + +/// The operator assumes NHWC layout for the input, regardless of the output layout. +#define XNN_FLAG_INPUT_NHWC 0x00000002 + +/// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size. +#define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004 + +/// Assume transposed weights in a batch matrix multiply operator. +#define XNN_FLAG_TRANSPOSE_B XNN_FLAG_TRANSPOSE_WEIGHTS + +/// Assume transposed input in a batch matrix multiply operator. +#define XNN_FLAG_TRANSPOSE_A 0x00000002 + +/// Implicitly flatten and reshape input of a Fully Connected operator into a 2D tensor. +#define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004 + +/// Match behaviour of TensorFlow 1.x. +#define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004 + +/// Static weights of the FP16 operator are in FP32 format. +#define XNN_FLAG_FP32_STATIC_WEIGHTS 0x00000008 + +/// Align corners of input and output images in resize operations. +#define XNN_FLAG_ALIGN_CORNERS 0x00000008 + +/// Yield worker threads of the thread pool to the system scheduler after the inference. +#define XNN_FLAG_YIELD_WORKERS 0x00000010 + +/// Use transient indirection buffer to reduce memory footprint. +#define XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER 0x00000020 + +/// Reduce the dimensions. +#define XNN_FLAG_REDUCE_DIMS 0x00000040 + +/// The number of entries in an array of xnn_dynamic_quantization_params that XNNPACK may read beyond array bounds. +/// The caller must allocate at least this many extra xnn_dynamic_quantization_params before passing the array to XNNPACK. +/// +/// Note: XNNPACK reads, but never writes beyond array bounds.
+#define XNN_EXTRA_QUANTIZATION_PARAMS 8 + +struct xnn_dynamic_quantization_params { + int32_t zero_point; + float scale; +}; + +/// Status code for any XNNPACK function call. +enum xnn_status { + /// The call succeeded, and all output arguments now contain valid data. + xnn_status_success = 0, + xnn_status_uninitialized = 1, + xnn_status_invalid_parameter = 2, + xnn_status_invalid_state = 3, + xnn_status_unsupported_parameter = 4, + xnn_status_unsupported_hardware = 5, + xnn_status_out_of_memory = 6, + xnn_status_reallocation_required = 7, +}; + +struct xnn_allocator { + /// User-specified pointer that will be passed as-is to all functions in this structure. + void* context; + /// Pointer to a function to be called for general memory allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param size - The size of the memory block to allocate, in bytes. + /// + /// @returns Pointer to the allocated memory block of at least @ref size bytes. + /// If allocation fails, the function must return NULL. + void* (*allocate)(void* context, size_t size); + /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously + /// allocated memory block. The content of the old memory block is copied to the new memory block. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL. + /// If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call. + /// @param size - The new size of the memory block to allocate, in bytes. + /// + /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous + /// memory block. + /// If allocation fails, the function must return NULL, but must not release the previous memory block. + void* (*reallocate)(void* context, void* pointer, size_t size); + /// Pointer to a function to be called for general memory de-allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL. + /// If the pointer is NULL, the @ref deallocate call is a no-op. + void (*deallocate)(void* context, void* pointer); + /// Pointer to a function to be called for aligned memory allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2. + /// @param size - The size of the memory block to allocate, in bytes. + /// + /// @returns Pointer to the allocated memory block of at least @ref size bytes. + /// If allocation fails, the function must return NULL. + void* (*aligned_allocate)(void* context, size_t alignment, size_t size); + /// Pointer to a function to be called for aligned memory de-allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL. + /// If the pointer is NULL, the @ref aligned_deallocate call is a no-op. + void (*aligned_deallocate)(void* context, void* pointer); +}; + +/// Initialize XNNPACK library. +/// +/// XNNPACK must be successfully initialized before use. 
During initialization, XNNPACK populates internal structures +/// depending on the host processor. Initialization can be time-consuming. +/// +/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation. +/// If this argument is NULL, system-provided memory management functions (e.g. malloc/free) +/// will be used. +/// +/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use. +/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition. +/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the +/// minimum hardware requirements for XNNPACK. E.g. this may happen on x86 +/// processors without SSE2 extension, or on 32-bit ARM processors without +/// the NEON SIMD extension. +enum xnn_status xnn_initialize(const struct xnn_allocator* allocator); + +/// Deinitialize XNNPACK library. +/// +/// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call. +/// +/// @retval xnn_status_success - deinitialization call succeeded. +enum xnn_status xnn_deinitialize(void); + +/// Subgraph is an abstract representation of a neural network model. +/// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model. +typedef struct xnn_subgraph* xnn_subgraph_t; + +/// Create an empty Subgraph object. +/// +/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation. +/// The Subgraph object will avoid creating internal Value IDs in the +/// [0, external_value_ids-1] range. +/// @param flags - binary features of the subgraph. No supported flags are currently defined. +/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon +/// successful return. +enum xnn_status xnn_create_subgraph( + uint32_t external_value_ids, + uint32_t flags, + xnn_subgraph_t* subgraph_out); + +/// Destroy a Subgraph object, as well as the Values and Nodes associated with the subgraph. +/// +/// @param subgraph - the Subgraph object to destroy. +enum xnn_status xnn_delete_subgraph( + xnn_subgraph_t subgraph); + +#define XNN_VALUE_FLAG_EXTERNAL_INPUT 0x00000001 +#define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002 +#define XNN_VALUE_FLAG_PERSISTENT 0x00000004 + +#define XNN_INVALID_VALUE_ID UINT32_MAX + +/// Type of elements in a Value object. +enum xnn_datatype { + /// Invalid data type. Valid Values never have this datatype. + xnn_datatype_invalid = 0, + /// IEEE754 single-precision floating-point. + xnn_datatype_fp32 = 1, + /// IEEE754 half-precision floating-point. + xnn_datatype_fp16 = 2, + /// Quantized 8-bit signed integer with shared per-Value quantization parameters. + xnn_datatype_qint8 = 3, + /// Quantized 8-bit unsigned integer with shared per-Value quantization parameters. + xnn_datatype_quint8 = 4, + /// Quantized 32-bit signed integer with shared per-Value quantization parameters. + xnn_datatype_qint32 = 5, + /// Quantized 8-bit signed integer with shared per-channel quantization parameters. + xnn_datatype_qcint8 = 6, + /// Quantized 32-bit signed integer with shared per-channel quantization parameters. + xnn_datatype_qcint32 = 7, + /// Quantized 4-bit signed integer with shared per-channel quantization parameters. + xnn_datatype_qcint4 = 8, + /// Dynamically quantized 8-bit signed integer with per-batch quantization parameters.
+ xnn_datatype_qdint8 = 9, +}; + +/// Define a tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param num_dims - number of dimensions in the shape. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized, +/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time +/// of the Subgraph object, and of any Runtime objects created from the Subgraph. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT +/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value. +enum xnn_status xnn_define_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + size_t num_dims, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Define a quantized tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - multiplication factor to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized, +/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time +/// of the Subgraph object, and of any Runtime objects created from the Subgraph. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT +/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value.
+enum xnn_status xnn_define_quantized_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + int32_t zero_point, + float scale, + size_t num_dims, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +enum xnn_status xnn_define_channelwise_quantized_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + const float* scale, + size_t num_dims, + size_t channel_dim, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Validate the dimensions, zero point, datatype, and scale of a quantized tensor-type. +/// +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - multiplication factor to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +enum xnn_status xnn_validate_quantized_tensor( + enum xnn_datatype datatype, + int32_t zero_point, + float scale, + size_t num_dims, + const size_t* dims); + +/// Validate the dimensions, channel_dim, zero point, datatype, and scales of a channelwise quantized tensor-type. +/// +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - per-channel multiplication factors to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters. +/// Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution, +/// Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in +/// the Depthwise Convolution operators. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +enum xnn_status xnn_validate_channelwise_quantized_tensor( + enum xnn_datatype datatype, + int32_t zero_point, + const float* scale, + size_t num_dims, + size_t channel_dim, + const size_t* dims); + +/// Define a channelwise quantized tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - per-channel multiplication factors to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters. +/// Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution, +/// Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in +/// the Depthwise Convolution operators. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns.
+/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized, +/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time +/// of the Subgraph object, and of any Runtime objects created from the Subgraph. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT +/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value. +enum xnn_status xnn_define_channelwise_quantized_tensor_value_v2( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + int32_t zero_point, + const float* scale, + size_t num_dims, + size_t channel_dim, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Define a dynamically quantized tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param num_dims - number of dimensions in the shape. +/// @param num_nonbatch_dims - number of non-batch dimensions in the shape. The leading (num_dims - num_nonbatch_dims) +/// dimensions will be flattened and treated as batch size. A set of quantization parameters +/// will be calculated for each batch element. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. No supported flags are currently defined. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value. +enum xnn_status xnn_define_dynamically_quantized_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + size_t num_dims, + size_t num_nonbatch_dims, + const size_t* dims, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Define a Convert Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Convert Node. No supported flags are currently defined. +enum xnn_status xnn_define_convert( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2D Convolution Node and add it to a Subgraph.
+/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING +/// flag is specified. +/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param kernel_height - kernel (filter) height. +/// @param kernel_width - kernel (filter) width. +/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride). +/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride). +/// @param dilation_height - dilation of kernel elements along the height dimension. +/// @param dilation_width - dilation of kernel elements along the width dimension. +/// @param groups - number of convolution groups. +/// @param group_input_channels - number of input channels per group. +/// @param group_output_channels - number of output channels per group. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, groups * group_input_channels] dimensions. +/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph +/// with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels] +/// dimensions. +/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Convolution Node without a bias. If +/// present, the bias tensor must be a 1D tensor defined in the @a subgraph with [groups * +/// group_output_channels] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, groups * group_output_channels] dimensions. +/// @param flags - binary features of the 2D Convolution Node. The only currently supported value is +/// XNN_FLAG_TENSORFLOW_SAME_PADDING. +enum xnn_status xnn_define_convolution_2d( + xnn_subgraph_t subgraph, + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + float output_min, + float output_max, + uint32_t input_id, + uint32_t filter_id, + uint32_t bias_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param padding_top - implicit padding above 2D output data. +/// @param padding_right - implicit padding to the right of 2D output data. +/// @param padding_bottom - implicit padding below 2D output data. +/// @param padding_left - implicit padding to the left of 2D output data.
+/// @param adjustment_height - additional elements in the bottom of the 2D output data. +/// @param adjustment_width - additional elements to the right of the 2D output data. +/// @param kernel_height - kernel (filter) height. +/// @param kernel_width - kernel (filter) width. +/// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride). +/// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride). +/// @param dilation_height - dilation of kernel elements along the height dimension. +/// @param dilation_width - dilation of kernel elements along the width dimension. +/// @param groups - number of convolution groups. +/// @param group_input_channels - number of input channels per group. +/// @param group_output_channels - number of output channels per group. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, groups * group_input_channels] dimensions. +/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph +/// with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels] +/// dimensions. +/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Deconvolution Node without a bias. +/// If present, the bias tensor must be a 1D tensor defined in the @a subgraph with +/// [groups * group_output_channels] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, groups * group_output_channels] dimensions. +/// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined. +enum xnn_status xnn_define_deconvolution_2d( + xnn_subgraph_t subgraph, + uint32_t padding_top, + uint32_t padding_right, + uint32_t padding_bottom, + uint32_t padding_left, + uint32_t adjustment_height, + uint32_t adjustment_width, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t upsampling_height, + uint32_t upsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + float output_min, + float output_max, + uint32_t input_id, + uint32_t filter_id, + uint32_t bias_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2D Depthwise Convolution Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING +/// flag is specified. +/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param kernel_height - kernel (filter) height. +/// @param kernel_width - kernel (filter) width.
+/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride). +/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride). +/// @param dilation_height - dilation of kernel elements along the height dimension. +/// @param dilation_width - dilation of kernel elements along the width dimension. +/// @param depth_multiplier - ratio of output channels to input channels. +/// @param input_channels - number of input channels. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, input_channels] dimensions. +/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph +/// with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions. +/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Depthwise Convolution Node without +/// a bias. If present, the bias tensor must be a 1D tensor defined in the @a subgraph with +/// [input_channels * depth_multiplier] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, input_channels * depth_multiplier] dimensions. +/// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is +/// XNN_FLAG_TENSORFLOW_SAME_PADDING. +enum xnn_status xnn_define_depthwise_convolution_2d( + xnn_subgraph_t subgraph, + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t depth_multiplier, + size_t input_channels, + float output_min, + float output_max, + uint32_t input_id, + uint32_t filter_id, + uint32_t bias_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Depth To Space 2D Node and add it to a Subgraph. +/// +/// The Depth To Space 2D Node rearranges data from depth into blocks of spatial data (a reverse transform to +/// Space To Depth). For a given input pixel, an output square of pixels with side @a block_size is formed from values +/// in the corresponding number of its channels. The output depth is therefore @a block_size x @a block_size times +/// smaller than that of the input. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param block_size - the size of the spatial block. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, OC * block_size * block_size] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH * block_size, IW * block_size, OC] dimensions. +/// @param flags - binary features of the Depth To Space 2D Node. No supported flags are currently defined.
+enum xnn_status xnn_define_depth_to_space_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t block_size,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+enum xnn_status xnn_define_depth_to_space(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t block_size,
+  uint32_t flags);
+
+/// Define a 1D Global Average Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
+///                   defined in the @a subgraph. Averaging is performed across the second-innermost dimension.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 1D Global Average Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_average_pooling_1d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Global Average Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
+///                   defined in the @a subgraph. Averaging is performed across the second- and third-innermost
+///                   dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 2D Global Average Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_average_pooling_2d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 1D Global Sum Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
+///                   defined in the @a subgraph. Summation is performed across the second-innermost dimension.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 1D Global Sum Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_sum_pooling_1d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Global Sum Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
+///                   defined in the @a subgraph. Summation is performed across the second- and third-innermost
+///                   dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 2D Global Sum Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_sum_pooling_2d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Average Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
+///                            flag is specified.
+/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
+///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
+///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
+///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param pooling_height - pooling (kernel) height.
+/// @param pooling_width - pooling (kernel) width.
+/// @param stride_height - displacement of the pooling window in the vertical dimension of the input pixels
+///                        corresponding to vertically adjacent output pixels.
+/// @param stride_width - displacement of the pooling window in the horizontal dimension of the input pixels
+///                       corresponding to horizontally adjacent output pixels.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH, IW, channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, OH, OW, channels] dimensions.
+/// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
+///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
+enum xnn_status xnn_define_average_pooling_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t input_padding_top,
+  uint32_t input_padding_right,
+  uint32_t input_padding_bottom,
+  uint32_t input_padding_left,
+  uint32_t pooling_height,
+  uint32_t pooling_width,
+  uint32_t stride_height,
+  uint32_t stride_width,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Fully Connected Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
+///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
+///                   1D and its last dimension must match the last dimension of the filter tensor.
+///                   In particular, if input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
+///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
+///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
+///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
+///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
+///                   total number of elements in the input tensor.
+/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
+///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
+///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
+///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
+/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
+///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
+///                  dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
+///                    dimensionality as the input tensor, all its dimensions but the last one must match the
+///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
+///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
+///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
+///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
+///                    total number of elements in the input tensor.
+/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
+///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
+enum xnn_status xnn_define_fully_connected(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t filter_id,
+  uint32_t bias_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Sparse Fully Connected Node and add it to a Subgraph.
+///
+/// This operator is experimental, and will be removed in the future.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
+///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
+///                   1D and its last dimension must match the last dimension of the filter tensor. In particular, if
+///                   input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
+///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
+///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
+///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
+///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
+///                   total number of elements in the input tensor.
+/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
+///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
+///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
+///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
+/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
+///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
+///                  dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
+///                    dimensionality as the input tensor, all its dimensions but the last one must match the
+///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
+///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
+///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
+///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
+///                    total number of elements in the input tensor.
+/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
+///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
+enum xnn_status xnn_define_fully_connected_sparse(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t filter_id,
+  uint32_t bias_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Max Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
+///                            flag is specified.
+/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
+///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
+///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
+///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param pooling_height - pooling (kernel) height.
+/// @param pooling_width - pooling (kernel) width.
+/// @param stride_height - displacement of the pooling window in the vertical dimension of the input pixels
+///                        corresponding to vertically adjacent output pixels.
+/// @param stride_width - displacement of the pooling window in the horizontal dimension of the input pixels
+///                       corresponding to horizontally adjacent output pixels.
+/// @param dilation_height - dilation of pooling elements along the height dimension.
+/// @param dilation_width - dilation of pooling elements along the width dimension.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH, IW, channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, OH, OW, channels] dimensions.
+/// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
+///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
+enum xnn_status xnn_define_max_pooling_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t input_padding_top,
+  uint32_t input_padding_right,
+  uint32_t input_padding_bottom,
+  uint32_t input_padding_left,
+  uint32_t pooling_height,
+  uint32_t pooling_width,
+  uint32_t stride_height,
+  uint32_t stride_width,
+  uint32_t dilation_height,
+  uint32_t dilation_width,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_padding_top - implicit zero-padding above 2D input data.
+/// @param input_padding_right - implicit zero-padding to the right of 2D input data.
+/// @param input_padding_bottom - implicit zero-padding below 2D input data.
+/// @param input_padding_left - implicit zero-padding to the left of 2D input data.
+/// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions matches this value.
+/// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions matches this value.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH, IW, channels] dimensions.
+/// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
+///                          be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
+/// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
+///                          output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
+///                          dimensions.
+/// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
+enum xnn_status xnn_define_argmax_pooling_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t input_padding_top,
+  uint32_t input_padding_right,
+  uint32_t input_padding_bottom,
+  uint32_t input_padding_left,
+  uint32_t pooling_height,
+  uint32_t pooling_width,
+  uint32_t input_id,
+  uint32_t output_value_id,
+  uint32_t output_index_id,
+  uint32_t flags);
+
+/// Define a 2D UnPooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param padding_top - implicit padding above 2D output data.
+/// @param padding_right - implicit padding to the right of 2D output data.
+/// @param padding_bottom - implicit padding below 2D output data.
+/// @param padding_left - implicit padding to the left of 2D output data.
+/// @param pooling_height - height of the pooling window.
+/// @param pooling_width - width of the pooling window.
+/// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
+///                         must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
+/// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
+///                         a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph
+///                         with [N, IH, IW, channels] dimensions.
+/// @param output_id - Value ID for the output tensor.
The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, channels] dimensions. +/// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined. +enum xnn_status xnn_define_unpooling_2d( + xnn_subgraph_t subgraph, + uint32_t padding_top, + uint32_t padding_right, + uint32_t padding_bottom, + uint32_t padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t input_value_id, + uint32_t input_index_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Add Node and add it to a Subgraph. +/// +/// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Add Node. No supported flags are currently defined. +enum xnn_status xnn_define_add2( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Multiply Node and add it to a Subgraph. +/// +/// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Multiply Node. No supported flags are currently defined. 
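/*
 * Editor's illustration (not part of the original header): a hedged sketch of
 * adding an unclamped, broadcasting elementwise multiply with
 * xnn_define_multiply2, declared below. The value IDs are assumed to be
 * defined elsewhere in the subgraph; -INFINITY/+INFINITY (from <math.h>)
 * express "no clamping" of the output.
 */
#include <math.h>

static enum xnn_status add_unclamped_multiply(
    xnn_subgraph_t subgraph, uint32_t a_id, uint32_t b_id, uint32_t product_id) {
  return xnn_define_multiply2(
      subgraph, /*output_min=*/-INFINITY, /*output_max=*/+INFINITY,
      a_id, b_id, product_id, /*flags=*/0);
}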
+enum xnn_status xnn_define_multiply2(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input1_id,
+  uint32_t input2_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+// Cap operations applied to logits (Q * K) of attention operator.
+enum xnn_attention_logits_cap_type {
+  // No capping.
+  xnn_attention_logits_cap_type_none = 0,
+  // Cap the absolute values of logits by tanh: tanh(logits / cap) * cap
+  xnn_attention_logits_cap_type_tanh
+};
+
+// Params when the cap type is xnn_attention_logits_cap_type_tanh.
+struct xnn_attention_logits_cap_tanh_params {
+  float cap;
+};
+
+/// Define a Scaled Dot-Product Attention Node and add it to a Subgraph.
+///
+/// This operator is experimental.
+///
+/// The Scaled Dot-Product Attention Node computes multi-head or multi-query scaled dot-product attention on the query,
+/// key, and value tensors.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param cap_type - type of cap to be applied to the logits.
+/// @param cap_params - parameters for the cap. Must be a pointer to xnn_attention_logits_cap_tanh_params if cap_type
+///                     is xnn_attention_logits_cap_type_tanh.
+/// @param query_id - Value ID for the query tensor. The query tensor must be a 3+-dimensional tensor defined in the
+///                   @a subgraph with the dimensions as [*, H, T, C], where H/T/C are the heads/tokens/channels, and *
+///                   is the 0 or more dimensions treated as batch size.
+/// @param key_id - Value ID for the key tensor. The key tensor must be a 2+-dimensional tensor defined in the
+///                 @a subgraph. It can have the same number of dimensions as the query, with the dimensions as
+///                 [*, H, U, C] (multi-head), or have 1 less dimension than the query, with the dimensions as
+///                 [*, U, C] (multi-query, number of heads omitted implies single head), where H/U/C are the
+///                 heads/key_value_tokens/channels, and * is the 0 or more dimensions treated as batch size. These
+///                 batch size dimensions must be the same as query.
+/// @param value_id - Value ID for the value tensor. The value tensor must be a 2+-dimensional tensor defined in the
+///                   @a subgraph. It can have the same number of dimensions as the query, with the dimensions as
+///                   [*, H, U, D] (multi-head), or have 1 less dimension than the query, with the dimensions as
+///                   [*, U, D] (multi-query, number of heads omitted implies single head), where H/U/D are the
+///                   heads/key_value_tokens/value_channels, and * is the 0 or more dimensions treated as batch size.
+///                   These batch size dimensions must be the same as query and key.
+/// @param scale_id - Value ID for the scale tensor. The scale tensor must be a 1D tensor defined in the @a subgraph
+///                   with [C] dimensions. The query tensor is multiplied with this scale tensor before the dot product
+///                   with the key tensor.
+/// @param mask_id - Value ID for the mask tensor. The mask tensor must be a 2D tensor defined in the @a subgraph with
+///                  [T, U] dimensions. The mask tensor is added to the logits (query dot key).
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 3+-dimensional tensor defined in the
+///                    @a subgraph with the dimensions as [*, H, T, D], where H/T/D are the heads/tokens/value_channels,
+///                    and * is the 0 or more dimensions treated as batch size. These batch size dimensions must be the
+///                    same as query, key, and value.
+/// @param flags - binary features of the Scaled Dot Product Attention Node. No supported flags are currently defined.
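/*
 * Editor's illustration (not part of the original header): a sketch of the
 * experimental attention node using the declaration below together with the
 * cap enum/struct defined above. The tanh cap value of 50.0f and all value
 * IDs are illustrative assumptions.
 */
static enum xnn_status add_tanh_capped_attention(
    xnn_subgraph_t subgraph,
    uint32_t query_id, uint32_t key_id, uint32_t value_id,
    uint32_t scale_id, uint32_t mask_id, uint32_t output_id) {
  static const struct xnn_attention_logits_cap_tanh_params cap_params = {
      .cap = 50.0f  /* logits are squashed to roughly [-50, 50] */
  };
  return xnn_define_scaled_dot_product_attention(
      subgraph, xnn_attention_logits_cap_type_tanh, &cap_params,
      query_id, key_id, value_id, scale_id, mask_id, output_id, /*flags=*/0);
}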
+enum xnn_status xnn_define_scaled_dot_product_attention( + xnn_subgraph_t subgraph, + enum xnn_attention_logits_cap_type cap_type, + const void* cap_params, + uint32_t query_id, + uint32_t key_id, + uint32_t value_id, + uint32_t scale_id, + uint32_t mask_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Subtract Node and add it to a Subgraph. +/// +/// The Subtract Node computes elementwise subtraction of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Subtract Node. No supported flags are currently defined. +enum xnn_status xnn_define_subtract( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Divide Node and add it to a Subgraph. +/// +/// The Divide Node computes elementwise division of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Divide Node. No supported flags are currently defined. +enum xnn_status xnn_define_divide( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Maximum Node and add it to a Subgraph. 
+/// +/// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Maximum Node. No supported flags are currently defined. +enum xnn_status xnn_define_maximum2( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Minimum Node and add it to a Subgraph. +/// +/// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Minimum Node. No supported flags are currently defined. +enum xnn_status xnn_define_minimum2( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Squared Difference Node and add it to a Subgraph. +/// +/// The Squared Difference Node computes elementwise squared difference of two tensor inputs with numpy broadcasting +/// rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. 
The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined. +enum xnn_status xnn_define_squared_difference( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Constant Pad Node with static padding specification and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array +/// must have as many elements as the number of dimensions in the input tensor. +/// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array +/// must have as many elements as the number of dimensions in the input tensor. +/// @param padding_value - constant value used to initialize padding elements. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor with padding. +/// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined. +enum xnn_status xnn_define_static_constant_pad( + xnn_subgraph_t subgraph, + const size_t* pre_paddings, + const size_t* post_paddings, + float padding_value, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Mean Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param num_reduction_axes - number of axes along which mean is computed. +/// @param reduction_axes - axes along which mean is computed. +/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with at least +/// @a num_reduction_axes dimensions defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor defined in the +/// @a subgraph with @a num_reduction_axes fewer dimensions than the input tensor (if +/// XNN_FLAG_REDUCE_DIMS is specified), or has same dimension rank but the dimension at +/// @a reduction_axes reduced to 1 (if XNN_FLAG_REDUCE_DIMS is not specified). +/// @param flags - binary features of the Mean Node. The only currently supported value is XNN_FLAG_REDUCE_DIMS +enum xnn_status xnn_define_static_mean( + xnn_subgraph_t subgraph, + size_t num_reduction_axes, + const size_t* reduction_axes, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Concatenate Node and add it to a Subgraph. +/// +/// The 2-Input Concatenate Node concatenates two tensors along a specified axis. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param axis - the axis to concatenate the two input tensors along +/// @param input1_id - Value ID for the first input tensor. 
The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// second input. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// first input. +/// @param output_id - Value ID for the output tensor. The output tensor must be a N-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis +/// dimension, where it is the sum of the corresponding dimensions of both inputs. +/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined. +enum xnn_status xnn_define_concatenate2( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 3-Input Concatenate Node and add it to a Subgraph. +/// +/// The 3-Input Concatenate Node concatenates three tensors along a specified axis. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param axis - the axis to concatenate the three input tensors along +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param output_id - Value ID for the output tensor. The output tensor must be a N-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis +/// dimension, where it is the sum of the corresponding dimensions of all inputs. +/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined. +enum xnn_status xnn_define_concatenate3( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t input3_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 4-Input Concatenate Node and add it to a Subgraph. +/// +/// The 4-Input Concatenate Node concatenates four tensors along a specified axis. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param axis - the axis to concatenate the four input tensors along +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input3_id - Value ID for the third input tensor. 
The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input4_id - Value ID for the fourth input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param output_id - Value ID for the output tensor. The output tensor must be a N-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis +/// dimension, where it is the sum of the corresponding dimensions of all inputs. +/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined. +enum xnn_status xnn_define_concatenate4( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t input3_id, + uint32_t input4_id, + uint32_t output_id, + uint32_t flags); + +enum xnn_status xnn_define_concatenate5( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t input3_id, + uint32_t input4_id, + uint32_t input5_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Copy Node and add it to a Subgraph. +/// +/// The Copy Node copies an input tensor to an output tensor. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the first input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Copy Node. No supported flags are currently defined. +enum xnn_status xnn_define_copy( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Output Split Node and add it to a Subgraph. +/// +/// The 2-Output Split Node splits an input tensor into two output tensors along a specified axis evenly. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param split_dim - the dimension to split the input tensor along +/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a +/// subgraph. +/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined +/// in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension +/// of the second output. The split_dim dimension is half of the input's split_dim. +/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the first output. The split_dim dimension is half of the input's split_dim. +/// @param flags - binary features of the Split Node. No supported flags are currently defined. +enum xnn_status xnn_define_even_split2( + xnn_subgraph_t subgraph, + size_t split_dim, + uint32_t input_id, + uint32_t output1_id, + uint32_t output2_id, + uint32_t flags); + +/// Define a 3-Output Split Node and add it to a Subgraph. +/// +/// The 3-Output Split Node splits an input tensor into three output tensors along a specified axis evenly. 
+/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param split_dim - the dimension to split the input tensor along +/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a +/// subgraph. +/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined +/// in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension +/// of the second and third output. The split_dim dimension is one third of the input's split_dim. +/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the first and third output. The split_dim dimension is one third of the input's +/// split_dim. +/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the second and third output. The split_dim dimension is one third of the input's +/// split_dim. +/// @param flags - binary features of the Split Node. No supported flags are currently defined. +enum xnn_status xnn_define_even_split3( + xnn_subgraph_t subgraph, + size_t split_dim, + uint32_t input_id, + uint32_t output1_id, + uint32_t output2_id, + uint32_t output3_id, + uint32_t flags); + +/// Define a 4-Output Split Node and add it to a Subgraph. +/// +/// The 4-Output Split Node splits an input tensor into four output tensors along a specified axis evenly. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param split_dim - the dimension to split the input tensor along +/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a +/// subgraph. +/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined +/// in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension +/// of the other output tensors. The split_dim dimension is one fourth of the input's split_dim. +/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's +/// split_dim. +/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's +/// split_dim. +/// @param output4_id - Value ID for the fourth output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's +/// split_dim. +/// @param flags - binary features of the Split Node. No supported flags are currently defined. 
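/*
 * Editor's illustration (not part of the original header): splitting a tensor
 * evenly into four outputs along one dimension with xnn_define_even_split4,
 * declared below. The value IDs are assumed to exist already; per the
 * documentation above, the input's split_dim extent is four times that of
 * each output.
 */
static enum xnn_status split_into_four(
    xnn_subgraph_t subgraph, size_t split_dim, uint32_t input_id,
    uint32_t out1_id, uint32_t out2_id, uint32_t out3_id, uint32_t out4_id) {
  return xnn_define_even_split4(
      subgraph, split_dim, input_id, out1_id, out2_id, out3_id, out4_id,
      /*flags=*/0);
}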
+enum xnn_status xnn_define_even_split4(
+  xnn_subgraph_t subgraph,
+  size_t split_dim,
+  uint32_t input_id,
+  uint32_t output1_id,
+  uint32_t output2_id,
+  uint32_t output3_id,
+  uint32_t output4_id,
+  uint32_t flags);
+
+/// Define a Reshape Node with static shape specification and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param num_dims - number of shape dimensions in the output tensor.
+/// @param new_shape - shape dimensions of the output tensor.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, its shape
+///                    must match @a new_shape, and it must contain the same number of elements as the input tensor.
+/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
+enum xnn_status xnn_define_static_reshape(
+  xnn_subgraph_t subgraph,
+  size_t num_dims,
+  const size_t* new_shape,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Node that reshapes a tensor to two dimensions, retaining the
+/// trailing dimension, and add it to a Subgraph.
+///
+/// This operator is experimental.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be
+///                   defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be
+///                    defined in the @a subgraph, and it must contain the same
+///                    number of elements as the input tensor.
+/// @param flags - binary features of the Reshape Node. No supported flags are
+///                currently defined.
+enum xnn_status xnn_define_reshape_2d(xnn_subgraph_t subgraph,
+                                      uint32_t input_id, uint32_t output_id,
+                                      uint32_t flags);
+
+/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param new_height - height dimension of the output tensor.
+/// @param new_width - width dimension of the output tensor.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, H, W, C] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, new_height, new_width, C] dimensions.
+/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
+///                XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
+enum xnn_status xnn_define_static_resize_bilinear_2d(
+  xnn_subgraph_t subgraph,
+  size_t new_height,
+  size_t new_width,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, H, W, channels] dimensions.
+/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph with
+///                   [channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, H, W, channels] dimensions.
+/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
+enum xnn_status xnn_define_prelu(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t slope_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a RoPE (Rotary Positional Embeddings) Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param max_sequence_size - maximum possible number of tokens (maximum sequence length) of the input/output tensors.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [batch, tokens, heads, channels] dimensions.
+/// @param weights_id - Value ID for the weights tensor. The weights tensor must be a 2D tensor defined in the
+///                     @a subgraph with [max_sequence_size, channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [batch, tokens, heads, channels] dimensions.
+/// @param flags - binary features of the RoPE Node. No supported flags are currently defined.
+enum xnn_status xnn_define_rope(
+  xnn_subgraph_t subgraph,
+  size_t max_sequence_size,
+  uint32_t input_id,
+  uint32_t weights_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define an Abs Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
+enum xnn_status xnn_define_abs(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Bankers' Rounding Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
+enum xnn_status xnn_define_bankers_rounding(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Batch Matrix Multiply Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
+///                    the @a subgraph. It must be at least 3D. The first N-2 dimensions must match the second input
+///                    tensor. The last 2 dimensions are [M, K]. If XNN_FLAG_TRANSPOSE_B is not specified, the last
+///                    dimension must match the second last dimension of the second input tensor. If
+///                    XNN_FLAG_TRANSPOSE_B is specified, the last dimension must match the last dimension of the
+///                    second input tensor.
+/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined
+///                    in the @a subgraph. It must be at least 3D. The first N-2 dimensions must match the first input tensor.
If XNN_FLAG_TRANSPOSE_B is not specified, the last 2 dimensions are [K, N], and the +/// second last dimension must match the last dimension of the first input tensor. If +/// XNN_FLAG_TRANSPOSE_B is specified, the last 2 dimensions are [N, K], and the last dimension must +/// match the last dimension of the first input tensor. +/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined in the +/// @a subgraph. It must be at least 3D. The first N-2 dimensions must match the first and second +/// input tensors . The last 2 dimensions must be [M, N]. +/// @param flags - binary features of the Batch Matrix Multiply Node. The only currently supported value is +/// XNN_FLAG_TRANSPOSE_B. +enum xnn_status xnn_define_batch_matrix_multiply( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Ceiling Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined. +enum xnn_status xnn_define_ceiling( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Clamp Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Clamp Node. No supported flags are currently defined. +enum xnn_status xnn_define_clamp( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param alpha - scale factor for negative output elements. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the ELU Node. No supported flags are currently defined. +enum xnn_status xnn_define_elu( + xnn_subgraph_t subgraph, + float alpha, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Floor Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Floor Node. 
No supported flags are currently defined. +enum xnn_status xnn_define_floor( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a HardSwish Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined. +enum xnn_status xnn_define_hardswish( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Leaky ReLU Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param negative_slope - scale factor for negative input elements. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined. +enum xnn_status xnn_define_leaky_relu( + xnn_subgraph_t subgraph, + float negative_slope, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Negate Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Negate Node. No supported flags are currently defined. +enum xnn_status xnn_define_negate( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Sigmoid Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined. +enum xnn_status xnn_define_sigmoid( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a SoftMax Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at +/// least one dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined. 
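/*
 * Editor's illustration (not part of the original header): a SoftMax Node over
 * an activation tensor, using xnn_define_softmax declared below. Both value
 * IDs are assumed to be defined elsewhere in the subgraph, with the output
 * shape matching the input shape as documented above.
 */
static enum xnn_status add_softmax(
    xnn_subgraph_t subgraph, uint32_t logits_id, uint32_t probabilities_id) {
  return xnn_define_softmax(subgraph, logits_id, probabilities_id, /*flags=*/0);
}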
+enum xnn_status xnn_define_softmax(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Space To Depth 2D Node and add it to a Subgraph.
+///
+/// The Space To Depth 2D Node rearranges blocks of spatial data into blocks of channel data (the reverse transform to
+/// Depth To Space 2D). For each output pixel, values are gathered from the corresponding input square of pixels with
+/// side @a block_size across all input channels. The output depth is therefore @a block_size x @a block_size times
+/// greater than that of the input.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param block_size - the size of the spatial block.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH * block_size, IW * block_size, OC] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, IH, IW, OC * block_size * block_size] dimensions.
+/// @param flags - binary features of the Space To Depth 2D Node. No supported flags are currently defined.
+enum xnn_status xnn_define_space_to_depth_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t block_size,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
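+/// For example (editor's illustration, not part of the original header): with @a block_size = 2,
+/// a [1, 4, 6, 3] input (N=1, IH=2, IW=3, OC=3) produces a [1, 2, 3, 12] output; each 2x2 spatial
+/// block of the input is folded into the channels of a single output pixel.
+
+/// Define a Square Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Square Node. No supported flags are currently defined.
+enum xnn_status xnn_define_square(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Square Root Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
+enum xnn_status xnn_define_square_root(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Reciprocal Square Root Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be
+///                   defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be
+///                    defined in the @a subgraph, and its shape must match the
+///                    shape of the input tensor.
+/// @param flags - binary features of the Reciprocal Square Root Node. No supported flags
+///                are currently defined.
+enum xnn_status xnn_define_reciprocal_square_root(xnn_subgraph_t subgraph,
+                                                  uint32_t input_id,
+                                                  uint32_t output_id,
+                                                  uint32_t flags);
+
+/// Define a Static Slice Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param num_dims - number of shape dimensions in the input and output tensors.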
+/// @param offsets - offsets in each dimension of the input tensor. This array must have @a num_dims elements.
+/// @param sizes - size of each dimension in the output tensor. This array must have @a num_dims elements.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    dimensions must match @a sizes.
+/// @param flags - binary features of the Static Slice Node. No supported flags are currently defined.
+enum xnn_status xnn_define_static_slice(
+  xnn_subgraph_t subgraph,
+  size_t num_dims,
+  const size_t* offsets,
+  const size_t* sizes,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Static Transpose Node and add it to a Subgraph.
+///
+/// The Static Transpose Node applies a generalized transpose to the input tensor using the permutation in @a perm.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in
+///                   the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
+///                    in the @a subgraph with each dimension equal to its corresponding permuted input dimension.
+/// @param num_dims - the number of permutation dimensions. This must be equal to the number of input dimensions.
+/// @param perm - the permutation of the axes of the input tensor. The perm array must contain 0 to N-1 in the
+///               permuted order.
+/// @param flags - binary features of the Static Transpose Node. No supported flags are currently defined.
+enum xnn_status xnn_define_static_transpose(
+  xnn_subgraph_t subgraph,
+  size_t num_dims,
+  const size_t* perm,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
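+/// For example (editor's illustration, not part of the original header): converting an NHWC
+/// tensor to NCHW with the Static Transpose Node defined above, assuming `subgraph`, `in_id`,
+/// and `out_id` were defined earlier:
+///
+///   const size_t perm[4] = {0, 3, 1, 2};  // out[n][c][h][w] = in[n][h][w][c]
+///   enum xnn_status status = xnn_define_static_transpose(
+///       subgraph, /*num_dims=*/4, perm, in_id, out_id, /*flags=*/0);
+///   assert(status == xnn_status_success);
+
+/// Define a Tanh Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Tanh Node. No supported flags are currently defined.
+enum xnn_status xnn_define_tanh(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Code cache is a cache for JIT generated code.
+typedef struct xnn_code_cache* xnn_code_cache_t;
+
+/// Weights cache can be finalized in these ways:
+enum xnn_weights_cache_finalization_kind {
+  /// Weights cache is finalized; no insert operations into the weights cache are allowed, even if the "inserted"
+  /// weights already exist in the cache. Weights cache memory will also be trimmed to a page boundary and set to
+  /// read-only (to prevent writes).
+  xnn_weights_cache_finalization_kind_hard,
+  /// Weights cache will be finalized with some extra space at the end; this allows "inserting" into the cache only
+  /// if the weights are already in the cache, and errors on inserting uncached weights. There is memory overhead.
+  xnn_weights_cache_finalization_kind_soft,
+};
+
+/// A combination of multiple factors to uniquely locate an entry in the weights cache.
+struct xnn_weights_cache_look_up_key {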
+  /// The unique seed for each ukernel. It is guaranteed that each ukernel provides
+  /// a consistent and identical seed.
+  uint32_t seed;
+  /// Pointer to the original kernel.
+  const void* kernel;
+  /// Pointer to the original bias; may be NULL.
+  const void* bias;
+};
+
+/// A group of function pointers to manage the weights cache. All functions may be
+/// called from multiple threads.
+struct xnn_weights_cache_provider {
+  /// User-specified pointer that will be passed as-is to all functions in this
+  /// structure.
+  void* context;
+
+  /// Looks up the tuple of {cache_key, kernel, bias} in the cache. If it is found,
+  /// returns the offset to the found entry for reuse. Otherwise, returns SIZE_MAX.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param cache_key - The key used to locate the weights cache entry.
+  size_t (*look_up)(void* context, const struct xnn_weights_cache_look_up_key* cache_key);
+
+  /// Ensures that the cache has enough space for `n` bytes. Returns the address at
+  /// which to store the packed weights, or NULL if it fails to reserve space.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param n - size to be reserved.
+  void* (*reserve_space)(void* context, size_t n);
+
+  /// Looks up the packed weights at `ptr` in the cache. If they are found, reuses them.
+  /// Otherwise, they are added to the cache. Returns the offset into the cache.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param cache_key - The key used to locate the weights cache entry.
+  /// @param ptr - pointer to the packed weights.
+  /// @param size - size of the packed weights.
+  size_t (*look_up_or_insert)(void* context, const struct xnn_weights_cache_look_up_key* cache_key, void* ptr, size_t size);
+
+  /// Returns whether the cache is finalized.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  bool (*is_finalized)(void* context);
+
+  /// Returns the absolute pointer corresponding to `offset`, where the offset was returned by
+  /// `look_up` or `look_up_or_insert`. This function must be called after finalization.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param offset - offset from the start of the internal buffer.
+  void* (*offset_to_addr)(void* context, size_t offset);
+
+  /// Destroys the weights cache object, as well as memory used for the cache.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  enum xnn_status (*delete_cache)(void* context);
+};
+
+/// Weights cache is a cache for packed weights. It can be reused between runtimes.
+typedef struct xnn_weights_cache_provider* xnn_weights_cache_t;
+
+/// Create a weights cache object specifying the initial size of the weights cache (in bytes).
+///
+/// @param[in] size - initial capacity of the weights cache (in bytes), i.e. it can hold @a size bytes without growing.
+/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache provider
+///                            upon successful return. Once created, the weights cache provider can be shared between
+///                            different Runtime objects.
+enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out);
+
+enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out);
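+/// For example (editor's sketch, not part of the original header): sharing packed weights
+/// between two runtimes built from the same subgraph, assuming `subgraph` already exists
+/// (xnn_create_runtime_v4, xnn_finalize_weights_cache, and xnn_delete_weights_cache are
+/// declared below):
+///
+///   xnn_weights_cache_t cache = NULL;
+///   xnn_create_weights_cache_with_size(/*size=*/1024 * 1024, &cache);
+///   xnn_runtime_t rt1 = NULL, rt2 = NULL;
+///   xnn_create_runtime_v4(subgraph, cache, /*workspace=*/NULL, /*threadpool=*/NULL, /*flags=*/0, &rt1);
+///   xnn_finalize_weights_cache(cache, xnn_weights_cache_finalization_kind_soft);
+///   // Soft finalization still allows later runtimes to "insert" weights that are
+///   // already cached; hard finalization would also make the memory read-only.
+///   xnn_create_runtime_v4(subgraph, cache, NULL, NULL, 0, &rt2);
+///   // ... use the runtimes, delete them, then:
+///   xnn_delete_weights_cache(cache);
+
+/// Finalizes the weights cache. The kind of finalization is specified by `finalization_kind`.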
+/// @param weights_cache - the weights cache object to finalize.
+/// @param finalization_kind - the kind of finalization.
+enum xnn_status xnn_finalize_weights_cache(
+  xnn_weights_cache_t weights_cache,
+  enum xnn_weights_cache_finalization_kind finalization_kind);
+
+/// Destroy a weights cache object, as well as memory used for the cache.
+/// @param weights_cache - the weights cache object to destroy.
+enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache);
+
+typedef struct xnn_workspace* xnn_workspace_t;
+
+/// Create a workspace object.
+/// @param workspace_out - pointer to the variable that will be initialized to a handle to the workspace object upon
+///                        successful return. Once created, the workspace can be shared between different Runtime
+///                        objects.
+enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out);
+/// Destroy a workspace object, as well as memory used by the workspace. Object destruction can be deferred until all
+/// Runtime objects created with this workspace are destroyed.
+/// @param workspace - the workspace object to destroy.
+enum xnn_status xnn_release_workspace(xnn_workspace_t workspace);
+
+/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
+typedef struct xnn_runtime* xnn_runtime_t;
+
+enum xnn_profile_info {
+  /// Returns a size_t containing the number of operators.
+  xnn_profile_info_num_operators,
+  /// Returns a char[] containing the null-character-separated names of all operators.
+  xnn_profile_info_operator_name,
+  /// Returns a uint64_t[] with the runtimes of all operators in the same order as xnn_profile_info_operator_name.
+  xnn_profile_info_operator_timing,
+};
+
+/// Return profile information for all operators.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2 or
+///                  @ref xnn_create_runtime_v3.
+/// @param param_name - type of profile information required.
+/// @param param_value_size - the size in bytes of memory pointed to by param_value. If this is not sufficient, then
+///                           param_value_size_ret will be set to the required size and xnn_status_out_of_memory will be
+///                           returned.
+/// @param param_value - a pointer to the memory location where the values for the given param_name will be written.
+/// @param param_value_size_ret - returns the number of bytes required to write the result if param_value_size is not
+///                               sufficient.
+enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
+                                               enum xnn_profile_info param_name,
+                                               size_t param_value_size,
+                                               void* param_value,
+                                               size_t* param_value_size_ret);
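+/// For example (editor's sketch, not part of the original header): the two-call size-query
+/// pattern described above, used to retrieve the operator names:
+///
+///   size_t size = 0;
+///   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_name,
+///                                  0, NULL, &size);  // returns xnn_status_out_of_memory
+///   char* names = malloc(size);
+///   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_name,
+///                                  size, names, &size);
+
+/// Create a Runtime object from a subgraph.
+///
+/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
+///                   Nodes can be added to the runtime once it is constructed.
+/// @param weights_cache - a cache for packed weights. The runtime will look up and reuse packed weights in this cache,
+///                        which reduces the memory allocated for packed weights.
+/// @param workspace - a workspace to hold internal tensors. The runtime will allocate space used for internal tensors
+///                    and track them using the workspace. A workspace can be shared and reused across different
+///                    runtimes. If workspace is NULL, there will be no sharing: each runtime has its own workspace.
+/// @param threadpool - the thread pool to be used for parallelization of computations in the runtime. If the thread
+///                     pool is NULL, the computation would run on the caller thread without parallelization.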
+/// @param flags - binary features of the runtime. The only currently supported values are
+///                XNN_FLAG_HINT_SPARSE_INFERENCE, XNN_FLAG_HINT_FP16_INFERENCE, XNN_FLAG_FORCE_FP16_INFERENCE,
+///                XNN_FLAG_YIELD_WORKERS, and XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER. If XNN_FLAG_YIELD_WORKERS is
+///                specified, worker threads would be yielded to the system scheduler after processing the last operator
+///                in the Runtime. If XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER is specified, convolution operators will
+///                initialize indirection buffers on each inference run using temporary memory in the workspace, instead
+///                of initializing persistent indirection buffers once.
+/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
+///                      successful return. Once constructed, the Runtime object is independent of the Subgraph object
+///                      used to create it.
+enum xnn_status xnn_create_runtime_v4(
+  xnn_subgraph_t subgraph,
+  xnn_weights_cache_t weights_cache,
+  xnn_workspace_t workspace,
+  pthreadpool_t threadpool,
+  uint32_t flags,
+  xnn_runtime_t* runtime_out);
+
+enum xnn_status xnn_create_runtime_v3(
+  xnn_subgraph_t subgraph,
+  xnn_weights_cache_t weights_cache,
+  pthreadpool_t threadpool,
+  uint32_t flags,
+  xnn_runtime_t* runtime_out);
+
+enum xnn_status xnn_create_runtime_v2(
+  xnn_subgraph_t subgraph,
+  pthreadpool_t threadpool,
+  uint32_t flags,
+  xnn_runtime_t* runtime_out);
+
+enum xnn_status xnn_create_runtime(
+  xnn_subgraph_t subgraph,
+  xnn_runtime_t* runtime_out);
+
+struct xnn_external_value {
+  uint32_t id;
+  void* data;
+};
+
+/// Reshape an external value.
+///
+/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
+///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
+///                      created for the Value.
+/// @param num_dims - number of dimensions in the shape.
+/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
+///               XNNPACK does not keep any pointers to this array after the function returns.
+enum xnn_status xnn_reshape_external_value(
+  xnn_runtime_t runtime,
+  uint32_t external_id,
+  size_t num_dims,
+  const size_t* dims);
+
+/// Get the external value shape.
+///
+/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
+///                      the Subgraph creation. The external ID cannot be XNN_INVALID_VALUE_ID.
+/// @param num_dims - a valid pointer into which the number of dimensions in the shape will be written. It cannot be
+///                   larger than XNN_MAX_TENSOR_DIMS.
+/// @param dims - pointer to an array of @a num_dims shape dimensions. This pointer cannot be NULL. It must be large
+///               enough to hold at least @a num_dims elements. XNNPACK does not keep any pointers to this array after
+///               the function returns.
+enum xnn_status xnn_get_external_value_shape(
+  xnn_runtime_t runtime,
+  uint32_t external_id,
+  size_t* num_dims,
+  size_t* dims);
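+/// For example (editor's sketch, not part of the original header): a typical dynamic-shape
+/// inference sequence, assuming `rt` was created with xnn_create_runtime_v4 above and
+/// `in_id`/`out_id` are external Value IDs (xnn_reshape_runtime, xnn_setup_runtime_v2, and
+/// xnn_invoke_runtime are declared below):
+///
+///   const size_t in_dims[2] = {batch_size, channels};
+///   xnn_reshape_external_value(rt, in_id, 2, in_dims);
+///   xnn_reshape_runtime(rt);  // propagates shapes and allocates memory
+///   size_t out_rank = 0, out_dims[XNN_MAX_TENSOR_DIMS];
+///   xnn_get_external_value_shape(rt, out_id, &out_rank, out_dims);
+///   struct xnn_external_value io[2] = {{in_id, input}, {out_id, output}};
+///   xnn_setup_runtime_v2(rt, 2, io);
+///   xnn_invoke_runtime(rt);
+
+/// Reshape the XNNPACK runtime.
+///
+/// Propagates the shapes of input tensors through the graph to determine the shapes of intermediate and output
+/// tensors. Memory is allocated if required. Output tensor shapes are returned by xnn_get_external_value_shape.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.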
+enum xnn_status xnn_reshape_runtime(
+  xnn_runtime_t runtime);
+
+/// Deprecated. Use xnn_reshape_runtime and xnn_setup_runtime_v2.
+///
+/// Setup data pointers for external inputs and outputs in a Runtime object and
+/// allocate memory.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.
+/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
+///                              match the number of external inputs and outputs in the runtime, i.e. all external
+///                              inputs and outputs in the runtime must be specified in one call.
+/// @param external_values - array with location information for all external inputs and outputs in the runtime.
+enum xnn_status xnn_setup_runtime(
+  xnn_runtime_t runtime,
+  size_t num_external_values,
+  const struct xnn_external_value* external_values);
+
+/// Setup data pointers for external inputs and outputs in a Runtime object.
+/// Should be called after xnn_reshape_runtime.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.
+/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
+///                              match the number of external inputs and outputs in the runtime, i.e. all external
+///                              inputs and outputs in the runtime must be specified in one call.
+/// @param external_values - array with location information for all external inputs and outputs in the runtime.
+enum xnn_status xnn_setup_runtime_v2(
+  xnn_runtime_t runtime,
+  size_t num_external_values,
+  const struct xnn_external_value* external_values);
+
+/// Execute forward pass for all operators in the runtime.
+///
+/// @param runtime - the Runtime object with the execution plan to invoke.
+enum xnn_status xnn_invoke_runtime(
+  xnn_runtime_t runtime);
+
+/// Destroy a Runtime object, as well as operators and memory associated with it.
+///
+/// @param runtime - the Runtime object to destroy.
+enum xnn_status xnn_delete_runtime(
+  xnn_runtime_t runtime);
+
+typedef struct xnn_operator* xnn_operator_t;
+
+enum xnn_status xnn_run_operator(
+  xnn_operator_t op,
+  pthreadpool_t threadpool);
+
+enum xnn_status xnn_delete_operator(
+  xnn_operator_t op);
+
+
+/// Operator API:
+/// - create operator will create and populate an xnn_operator_t
+/// - reshape operator will update fields in the xnn_operator_t with shape/dimensions and parallelization information
+/// - setup operator will update pointers to inputs and outputs
+/// Each supported operator must have a create, reshape, and setup function. (Optionally a run function.)
+/// Operators listed below are in alphabetical order by operator name; within each operator, we sort alphabetically by
+/// data layout and type. We also group the create, reshape, setup (and optionally run) functions of each operator
+/// together.
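+/// For example (editor's sketch, not part of the original header): the full lifecycle for the
+/// Abs operator declared below, assuming XNNPACK was already initialized with xnn_initialize
+/// and `input`/`output` point to batch_size * channels floats:
+///
+///   xnn_operator_t abs_op = NULL;
+///   xnn_create_abs_nc_f32(/*flags=*/0, &abs_op);                    // create
+///   xnn_reshape_abs_nc_f32(abs_op, batch_size, channels,
+///                          /*input_stride=*/channels,
+///                          /*output_stride=*/channels,
+///                          /*threadpool=*/NULL);                    // reshape
+///   xnn_setup_abs_nc_f32(abs_op, input, output);                    // setup
+///   xnn_run_operator(abs_op, /*threadpool=*/NULL);                  // execute
+///   xnn_delete_operator(abs_op);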
+ +enum xnn_status xnn_create_abs_nc_f16( + uint32_t flags, + xnn_operator_t* abs_op_out); + +enum xnn_status xnn_reshape_abs_nc_f16( + xnn_operator_t abs_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_abs_nc_f16( + xnn_operator_t abs_op, + const void* input, + void* output); + +enum xnn_status xnn_create_abs_nc_f32( + uint32_t flags, + xnn_operator_t* abs_op_out); + +enum xnn_status xnn_reshape_abs_nc_f32( + xnn_operator_t abs_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_abs_nc_f32( + xnn_operator_t abs_op, + const float* input, + float* output); + +enum xnn_status xnn_run_abs_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_add_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_f16( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_f16( + xnn_operator_t add_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_add_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_f32( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_f32( + xnn_operator_t add_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_add_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_add_nd_qs8( + int8_t input1_zero_point, + float input1_scale, + int8_t input2_zero_point, + float input2_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_qs8( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_qs8( + xnn_operator_t add_op, + const int8_t* input1, + const int8_t* input2, + int8_t* output); + +enum xnn_status xnn_run_add_nd_qs8( + size_t num_input1_dims, + const size_t* input1_shape, + int8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + int8_t input2_zero_point, + float input2_scale, + const int8_t* input1, + const int8_t* input2, + int8_t* output, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_add_nd_qu8( + uint8_t input1_zero_point, + float input1_scale, + uint8_t input2_zero_point, + float input2_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + 
xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_qu8( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_qu8( + xnn_operator_t add_op, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output); + +enum xnn_status xnn_run_add_nd_qu8( + size_t num_input1_dims, + const size_t* input1_shape, + uint8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + uint8_t input2_zero_point, + float input2_scale, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t flags, + xnn_operator_t* argmax_pooling_op_out); + +enum xnn_status xnn_reshape_argmax_pooling2d_nhwc_f32( + xnn_operator_t argmax_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32( + xnn_operator_t argmax_pooling_op, + void* workspace, + const float* input, + float* output, + uint32_t* index); + +enum xnn_status xnn_create_average_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* average_pooling_op_out); + +enum xnn_status xnn_reshape_average_pooling2d_nhwc_f16( + xnn_operator_t average_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_average_pooling2d_nhwc_f16( + xnn_operator_t average_pooling_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_average_pooling2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* average_pooling_op_out); + +enum xnn_status xnn_reshape_average_pooling2d_nhwc_f32( + xnn_operator_t average_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_average_pooling2d_nhwc_f32( + xnn_operator_t average_pooling_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status 
xnn_create_average_pooling2d_nhwc_qu8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* average_pooling_op_out); + +enum xnn_status xnn_reshape_average_pooling2d_nhwc_qu8( + xnn_operator_t average_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8( + xnn_operator_t average_pooling_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_bankers_rounding_nc_f16( + uint32_t flags, + xnn_operator_t* rounding_op_out); + +enum xnn_status xnn_reshape_bankers_rounding_nc_f16( + xnn_operator_t rounding_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_bankers_rounding_nc_f16( + xnn_operator_t rounding_op, + const void* input, + void* output); + +enum xnn_status xnn_create_bankers_rounding_nc_f32( + uint32_t flags, + xnn_operator_t* rounding_op_out); + +enum xnn_status xnn_reshape_bankers_rounding_nc_f32( + xnn_operator_t rounding_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_bankers_rounding_nc_f32( + xnn_operator_t rounding_op, + const float* input, + float* output); + +enum xnn_status xnn_run_bankers_rounding_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_batch_matrix_multiply_nc_f16( + uint32_t flags, + xnn_operator_t* batch_matrix_multiply_op); + +enum xnn_status xnn_reshape_batch_matrix_multiply_nc_f16( + xnn_operator_t batch_matrix_multiply_op, + size_t batch_size, + size_t m, + size_t k, + size_t n, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_batch_matrix_multiply_nc_f16( + xnn_operator_t batch_matrix_multiply_op, + void* workspace, + const void* lhs_input, + const void* rhs_input, + void* output); + +enum xnn_status xnn_create_batch_matrix_multiply_nc_f32( + uint32_t flags, + xnn_operator_t* batch_matrix_multiply_op); + +enum xnn_status xnn_reshape_batch_matrix_multiply_nc_f32( + xnn_operator_t batch_matrix_multiply_op, + size_t batch_size, + size_t m, + size_t k, + size_t n, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_batch_matrix_multiply_nc_f32( + xnn_operator_t batch_matrix_multiply_op, + void* workspace, + const float* lhs_input, + const float* rhs_input, + float* output); + +enum xnn_status xnn_create_ceiling_nc_f16( + uint32_t flags, + xnn_operator_t* ceiling_op_out); + +enum xnn_status xnn_reshape_ceiling_nc_f16( + xnn_operator_t ceiling_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_ceiling_nc_f16( + xnn_operator_t ceiling_op, + const void* input, + void* output); + +enum xnn_status xnn_create_ceiling_nc_f32( + uint32_t flags, + xnn_operator_t* ceiling_op_out); + +enum xnn_status xnn_run_ceiling_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_ceiling_nc_f32( + xnn_operator_t ceiling_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_ceiling_nc_f32( + xnn_operator_t ceiling_op, + const float* input, + float* output); + +enum xnn_status xnn_create_channel_shuffle_nc_x8( + size_t groups, + size_t group_channels, + size_t input_stride, + size_t output_stride, + uint32_t flags, + xnn_operator_t* channel_shuffle_op_out); + +enum xnn_status xnn_reshape_channel_shuffle_nc_x8( + xnn_operator_t channel_shuffle_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_channel_shuffle_nc_x8( + xnn_operator_t channel_shuffle_op, + const void* input, + void* output); + +enum xnn_status xnn_create_channel_shuffle_nc_x32( + size_t groups, + size_t group_channels, + size_t input_stride, + size_t output_stride, + uint32_t flags, + xnn_operator_t* channel_shuffle_op_out); + +enum xnn_status xnn_reshape_channel_shuffle_nc_x32( + xnn_operator_t channel_shuffle_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_channel_shuffle_nc_x32( + xnn_operator_t channel_shuffle_op, + const void* input, + void* output); + +enum xnn_status xnn_create_clamp_nc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_f16( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_clamp_nc_f16( + xnn_operator_t clamp_op, + const void* input, + void* output); + +enum xnn_status xnn_create_clamp_nc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_f32( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_clamp_nc_f32( + xnn_operator_t clamp_op, + const float* input, + float* output); + +enum xnn_status xnn_run_clamp_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_clamp_nc_s8( + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_s8( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_clamp_nc_s8( + xnn_operator_t clamp_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_clamp_nc_u8( + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_u8( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_clamp_nc_u8( + xnn_operator_t clamp_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_constant_pad_nd_x8( + const void* padding_value, + uint32_t flags, + xnn_operator_t* constant_pad_op_out); + +enum xnn_status xnn_reshape_constant_pad_nd_x8( + xnn_operator_t constant_pad_op, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_padding, + const size_t* post_padding, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_constant_pad_nd_x8( + xnn_operator_t constant_pad_op, + const void* input, + void* output); + +enum xnn_status xnn_run_constant_pad_nd_x8( + uint32_t flags, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_paddings, + const size_t* post_paddings, + const void* input, + void* output, + const void* padding_value, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_constant_pad_nd_x16( + const void* padding_value, + uint32_t flags, + xnn_operator_t* constant_pad_op_out); + +enum xnn_status xnn_reshape_constant_pad_nd_x16( + xnn_operator_t constant_pad_op, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_padding, + const size_t* post_padding, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_constant_pad_nd_x16( + xnn_operator_t constant_pad_op, + const void* input, + void* output); + +enum xnn_status xnn_run_constant_pad_nd_x16( + uint32_t flags, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_paddings, + const size_t* post_paddings, + const void* input, + void* output, + const void* padding_value, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_constant_pad_nd_x32( + const void* padding_value, + uint32_t flags, + xnn_operator_t* constant_pad_op_out); + +enum xnn_status xnn_reshape_constant_pad_nd_x32( + xnn_operator_t constant_pad_op, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_padding, + const size_t* post_padding, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_constant_pad_nd_x32( + xnn_operator_t constant_pad_op, + const void* input, + void* output); + +enum xnn_status xnn_run_constant_pad_nd_x32( + uint32_t flags, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_paddings, + const size_t* post_paddings, + const void* input, + void* output, + const void* padding_value, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f16_f32( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f16_f32( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f16_f32( + xnn_operator_t convert_op, + const void* input, + float* output); + +enum xnn_status xnn_run_convert_nc_f16_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const void* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f16_qd8( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f16_qd8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +// quantization_params must be padded with at least XNN_EXTRA_QUANTIZATION_PARAMS entries. 
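+// For example (editor's sketch, not part of the original header), for a reshape with
+// `batch_size` rows one might allocate:
+//
+//   struct xnn_dynamic_quantization_params* quantization_params =
+//       malloc((batch_size + XNN_EXTRA_QUANTIZATION_PARAMS) *
+//              sizeof(struct xnn_dynamic_quantization_params));
+//
+// and pass it to the setup call below.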
+enum xnn_status xnn_setup_convert_nc_f16_qd8( + xnn_operator_t convert_op, + const void* input, + int8_t* output, + struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_create_convert_nc_f32_qd8( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_qd8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +// quantization_params must be padded with at least XNN_EXTRA_QUANTIZATION_PARAMS entries. +enum xnn_status xnn_setup_convert_nc_f32_qd8( + xnn_operator_t convert_op, + const float* input, + int8_t* output, + struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_create_convert_nc_f32_f16( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_f16( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f32_f16( + xnn_operator_t convert_op, + const float* input, + void* output); + +enum xnn_status xnn_run_convert_nc_f32_f16( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + void* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f32_qs8( + float output_scale, + int8_t output_zero_point, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_qs8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f32_qs8( + xnn_operator_t convert_op, + const float* input, + int8_t* output); + +enum xnn_status xnn_run_convert_nc_f32_qs8( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + int8_t* output, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f32_qu8( + float output_scale, + uint8_t output_zero_point, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_qu8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f32_qu8( + xnn_operator_t convert_op, + const float* input, + uint8_t* output); + +enum xnn_status xnn_run_convert_nc_f32_qu8( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + uint8_t* output, + float output_scale, + uint8_t output_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_qs8( + float input_scale, + int8_t input_zero_point, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs8( + xnn_operator_t convert_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_convert_nc_qs8_f16( + float input_scale, + int8_t 
input_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs8_f16( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs8_f16( + xnn_operator_t convert_op, + const int8_t* input, + void* output); + +enum xnn_status xnn_create_convert_nc_qs8_f32( + float input_scale, + int8_t input_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs8_f32( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs8_f32( + xnn_operator_t convert_op, + const int8_t* input, + float* output); + +enum xnn_status xnn_run_convert_nc_qs8_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const int8_t* input, + float* output, + float input_scale, + int8_t input_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_qs16_qs8( + float input_scale, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs16_qs8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs16_qs8( + xnn_operator_t convert_op, + const int16_t* input, + int8_t* output); + +enum xnn_status xnn_run_convert_nc_qs16_qs8( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const int16_t* input, + int8_t* output, + float input_scale, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_qu8( + float input_scale, + uint8_t input_zero_point, + float output_scale, + uint8_t output_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qu8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qu8( + xnn_operator_t convert_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_convert_nc_qu8_f32( + float input_scale, + uint8_t input_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qu8_f32( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qu8_f32( + xnn_operator_t convert_op, + const uint8_t* input, + float* output); + +enum xnn_status xnn_run_convert_nc_qu8_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const uint8_t* input, + float* output, + float input_scale, + uint8_t input_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convolution2d_nchw_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, 
+ size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nchw_f16( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nchw_f16( + xnn_operator_t convolution_op, + const void* input, + void* output); + +enum xnn_status xnn_create_convolution2d_nchw_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nchw_f32( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nchw_f32( + xnn_operator_t convolution_op, + const float* input, + float* output); + +enum xnn_status xnn_create_convolution2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_f16( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_f16( + xnn_operator_t convolution_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_convolution2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* 
convolution_op_out); + +// Forward declare. +struct xnn_post_operation; + +/// Create a convolution operator with a number of post operations. The +/// convolution operator created using this function does not have output_min +/// and output_max. The list of operators in post_operations will be applied in +/// order. Convolution with post operations is only supported on JIT platforms +/// and when JIT is enabled. +enum xnn_status xnn_create_fused_convolution2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + size_t num_post_operations, + struct xnn_post_operation* post_operations, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_f32( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_f32( + xnn_operator_t convolution_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_convolution2d_nhwc_qd8_f16_qc8w( + uint32_t input_padding_top, uint32_t input_padding_right, + uint32_t input_padding_bottom, uint32_t input_padding_left, + uint32_t kernel_height, uint32_t kernel_width, uint32_t subsampling_height, + uint32_t subsampling_width, uint32_t dilation_height, + uint32_t dilation_width, uint32_t groups, size_t group_input_channels, + size_t group_output_channels, size_t input_channel_stride, + size_t output_channel_stride, const float* kernel_scale, + const int8_t* kernel, const float* bias, float output_min, float output_max, + uint32_t flags, xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_create_convolution2d_nhwc_qd8_f32_qc8w( + uint32_t input_padding_top, uint32_t input_padding_right, + uint32_t input_padding_bottom, uint32_t input_padding_left, + uint32_t kernel_height, uint32_t kernel_width, uint32_t subsampling_height, + uint32_t subsampling_width, uint32_t dilation_height, + uint32_t dilation_width, uint32_t groups, size_t group_input_channels, + size_t group_output_channels, size_t input_channel_stride, + size_t output_channel_stride, const float* kernel_scale, + const int8_t* kernel, const float* bias, float output_min, float output_max, + uint32_t flags, xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_create_convolution2d_nhwc_qs8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + int8_t input_zero_point, + float 
input_scale, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qd8_f16_qc8w( + xnn_operator_t convolution_op, size_t batch_size, size_t input_height, + size_t input_width, size_t* workspace_size, size_t* workspace_alignment, + size_t* output_height_out, size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qd8_f32_qc8w( + xnn_operator_t convolution_op, size_t batch_size, size_t input_height, + size_t input_width, size_t* workspace_size, size_t* workspace_alignment, + size_t* output_height_out, size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qs8( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f16_qc8w( + xnn_operator_t convolution_op, void* workspace, const int8_t* input, + void* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f32_qc8w( + xnn_operator_t convolution_op, void* workspace, const int8_t* input, + float* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_setup_convolution2d_nhwc_qs8( + xnn_operator_t convolution_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_convolution2d_nhwc_qs8_qc8w( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + int8_t input_zero_point, + float input_scale, + const float* kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qs8_qc8w( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_qs8_qc8w( + xnn_operator_t convolution_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_convolution2d_nhwc_qu8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t 
output_channel_stride, + uint8_t input_zero_point, + float input_scale, + uint8_t kernel_zero_point, + float kernel_scale, + const uint8_t* kernel, + const int32_t* bias, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qu8( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_qu8( + xnn_operator_t convolution_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_copy_nc_x8( + uint32_t flags, + xnn_operator_t* copy_op_out); + +enum xnn_status xnn_reshape_copy_nc_x8( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_copy_nc_x8( + xnn_operator_t copy_op, + const void* input, + void* output); + +enum xnn_status xnn_create_copy_nc_x16( + uint32_t flags, + xnn_operator_t* copy_op_out); + +enum xnn_status xnn_reshape_copy_nc_x16( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_copy_nc_x16( + xnn_operator_t copy_op, + const void* input, + void* output); + +enum xnn_status xnn_create_copy_nc_x32( + uint32_t flags, + xnn_operator_t* copy_op_out); + +enum xnn_status xnn_reshape_copy_nc_x32( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_copy_nc_x32( + xnn_operator_t copy_op, + const void* input, + void* output); + +enum xnn_status xnn_run_copy_nc_x32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const uint32_t* input, + uint32_t* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_deconvolution2d_nhwc_f16( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_f16( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_f16( + xnn_operator_t deconvolution_op, + const void* input, + void* output); + +enum xnn_status xnn_create_deconvolution2d_nhwc_f32( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t 
kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_f32( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_f32( + xnn_operator_t deconvolution_op, + const float* input, + float* output); + +enum xnn_status xnn_create_deconvolution2d_nhwc_qs8( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + int8_t input_zero_point, + float input_scale, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_qs8( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8( + xnn_operator_t deconvolution_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_deconvolution2d_nhwc_qu8( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + uint8_t input_zero_point, + float input_scale, + uint8_t kernel_zero_point, + float kernel_scale, + const uint8_t* kernel, + const int32_t* bias, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_qu8( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8( + xnn_operator_t deconvolution_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x16( + uint32_t block_size, + uint32_t flags, + 
xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nchw2nhwc_x16( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x16( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nchw2nhwc_x32( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nhwc_x8( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nhwc_x8( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nhwc_x8( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nhwc_x16( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nhwc_x16( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nhwc_x16( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nhwc_x32( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nhwc_x32( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nhwc_x32( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_divide_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* divide_op_out); + +enum xnn_status xnn_reshape_divide_nd_f16( + xnn_operator_t divide_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_divide_nd_f16( + xnn_operator_t divide_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_divide_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* divide_op_out); + +enum xnn_status xnn_reshape_divide_nd_f32( + xnn_operator_t divide_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* 
input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_divide_nd_f32( + xnn_operator_t divide_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_divide_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_dynamic_fully_connected_nc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* dynamic_fully_connected_op_out); + +enum xnn_status xnn_reshape_dynamic_fully_connected_nc_f16( + xnn_operator_t dynamic_fully_connected_op, + size_t batch_size, + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_dynamic_fully_connected_nc_f16( + xnn_operator_t dynamic_fully_connected_op, + void* workspace, + const void* input, + const void* kernel, + const void* bias, + void* output); + +enum xnn_status xnn_create_dynamic_fully_connected_nc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* dynamic_fully_connected_op_out); + +enum xnn_status xnn_reshape_dynamic_fully_connected_nc_f32( + xnn_operator_t dynamic_fully_connected_op, + size_t batch_size, + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_dynamic_fully_connected_nc_f32( + xnn_operator_t dynamic_fully_connected_op, + void* workspace, + const float* input, + const float* kernel, + const float* bias, + float* output); + +enum xnn_status xnn_create_elu_nc_f16( + float alpha, + uint32_t flags, + xnn_operator_t* elu_op_out); + +enum xnn_status xnn_reshape_elu_nc_f16( + xnn_operator_t elu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_elu_nc_f16( + xnn_operator_t elu_op, + const void* input, + void* output); + +enum xnn_status xnn_create_elu_nc_f32( + float alpha, + uint32_t flags, + xnn_operator_t* elu_op_out); + +enum xnn_status xnn_reshape_elu_nc_f32( + xnn_operator_t elu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_elu_nc_f32( + xnn_operator_t elu_op, + const float* input, + float* output); + +enum xnn_status xnn_run_elu_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + float alpha, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_elu_nc_qs8( + float alpha, + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* elu_op_out); + +enum xnn_status xnn_reshape_elu_nc_qs8( + xnn_operator_t elu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_elu_nc_qs8( + xnn_operator_t elu_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_floor_nc_f16( + uint32_t flags, + xnn_operator_t* floor_op_out); + +enum xnn_status 
xnn_reshape_floor_nc_f16( + xnn_operator_t floor_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_floor_nc_f16( + xnn_operator_t floor_op, + const void* input, + void* output); + +enum xnn_status xnn_create_floor_nc_f32( + uint32_t flags, + xnn_operator_t* floor_op_out); + +enum xnn_status xnn_reshape_floor_nc_f32( + xnn_operator_t floor_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_floor_nc_f32( + xnn_operator_t floor_op, + const float* input, + float* output); + +enum xnn_status xnn_run_floor_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_f16( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f16( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f16( + xnn_operator_t fully_connected_op, + const void* input, + void* output); + +enum xnn_status xnn_create_fully_connected_nc_f32( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f32( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f32( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + +enum xnn_status xnn_create_fully_connected_nc_f32_qc4w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t kernel_zero_point, + const float* kernel_scale, + const uint8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f32_qc4w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f32_qc4w( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + +enum xnn_status xnn_create_fully_connected_nc_f32_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel_scale, + const int8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f32_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f32_qc8w( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + 
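// --- Editorial sketch (not part of the diffed header) -------------------
// The fully-connected operators above follow XNNPACK's common three-step
// lifecycle: create once with the weights, reshape whenever the batch size
// changes, then setup to bind input/output pointers before running. A
// minimal sketch for the f32 variant; the zero-filled weights, the
// [output_channels, input_channels] row-major kernel layout, and the use of
// a NULL threadpool (single-threaded) are assumptions made for illustration.
#include <math.h>
#include <stddef.h>
#include <xnnpack.h>

static enum xnn_status fully_connected_example(void) {
  enum xnn_status st = xnn_initialize(/*allocator=*/NULL);
  if (st != xnn_status_success) return st;

  enum { BATCH = 2, IN_CH = 4, OUT_CH = 3 };
  static const float kernel[OUT_CH * IN_CH] = {0};  // assumed [out, in] layout
  static const float bias[OUT_CH] = {0};
  static const float input[BATCH * IN_CH] = {0};
  static float output[BATCH * OUT_CH];

  // Create: packs the weights into the operator object.
  xnn_operator_t fc_op = NULL;
  st = xnn_create_fully_connected_nc_f32(
      IN_CH, OUT_CH, /*input_stride=*/IN_CH, /*output_stride=*/OUT_CH,
      kernel, bias, /*output_min=*/-INFINITY, /*output_max=*/INFINITY,
      /*flags=*/0, /*code_cache=*/NULL, /*weights_cache=*/NULL, &fc_op);
  if (st != xnn_status_success) return st;

  // Reshape: fixes the batch size. Setup: binds the data pointers.
  st = xnn_reshape_fully_connected_nc_f32(fc_op, BATCH, /*threadpool=*/NULL);
  if (st == xnn_status_success)
    st = xnn_setup_fully_connected_nc_f32(fc_op, input, output);
  if (st == xnn_status_success)
    st = xnn_run_operator(fc_op, /*threadpool=*/NULL);
  xnn_delete_operator(fc_op);
  return st;
}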
+enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc4w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t kernel_zero_point, + const float* kernel_scale, + const void* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc4w( + xnn_operator_t fully_connected_op, + const int8_t* input, + void* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qc4w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qd8_f32_qc4w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t kernel_zero_point, + const float* kernel_scale, + const void* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc4w( + xnn_operator_t fully_connected_op, + const int8_t* input, + float* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc4w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel_scale, + const int8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc8w( + xnn_operator_t fully_connected_op, + const int8_t* input, + void* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qd8_f32_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel_scale, + const int8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc8w( + xnn_operator_t fully_connected_op, + const int8_t* input, + float* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qs8( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + 
xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_qs8( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_qs8( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_fully_connected_nc_qs8_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + const float* kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_qs8_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_qs8_qc8w( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_fully_connected_nc_qu8( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t input_zero_point, + float input_scale, + uint8_t kernel_zero_point, + float kernel_scale, + const uint8_t* kernel, + const int32_t* bias, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_qu8( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_qu8( + xnn_operator_t fully_connected_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_global_average_pooling_ncw_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_ncw_f16( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_ncw_f16( + xnn_operator_t global_average_pooling_op, + const void* input, + void* output); + +enum xnn_status xnn_create_global_average_pooling_ncw_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_ncw_f32( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_ncw_f32( + xnn_operator_t global_average_pooling_op, + const float* input, + float* output); + +enum xnn_status xnn_create_global_average_pooling_nwc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_f16( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_f16( + xnn_operator_t global_average_pooling_op, + void* workspace, + const void* input, + void* output); + +enum 
xnn_status xnn_create_global_average_pooling_nwc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_f32( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_f32( + xnn_operator_t global_average_pooling_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_global_average_pooling_nwc_qs8( + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_qs8( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_qs8( + xnn_operator_t global_average_pooling_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_global_average_pooling_nwc_qu8( + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_qu8( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_qu8( + xnn_operator_t global_average_pooling_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_global_sum_pooling_nwc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_sum_pooling_op_out); + +enum xnn_status xnn_reshape_global_sum_pooling_nwc_f16( + xnn_operator_t global_sum_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_sum_pooling_nwc_f16( + xnn_operator_t global_sum_pooling_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_global_sum_pooling_nwc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_sum_pooling_op_out); + +enum xnn_status xnn_reshape_global_sum_pooling_nwc_f32( + xnn_operator_t global_sum_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_sum_pooling_nwc_f32( + xnn_operator_t global_sum_pooling_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_hardswish_nc_f16( + uint32_t flags, + xnn_operator_t* hardswish_op_out); + +enum xnn_status xnn_reshape_hardswish_nc_f16( + xnn_operator_t hardswish_op, + size_t batch_size, + size_t channels, + size_t input_stride, + 
size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_hardswish_nc_f16( + xnn_operator_t hardswish_op, + const void* input, + void* output); + +enum xnn_status xnn_create_hardswish_nc_f32( + uint32_t flags, + xnn_operator_t* hardswish_op_out); + +enum xnn_status xnn_reshape_hardswish_nc_f32( + xnn_operator_t hardswish_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_hardswish_nc_f32( + xnn_operator_t hardswish_op, + const float* input, + float* output); + +enum xnn_status xnn_run_hardswish_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_leaky_relu_nc_f16( + float negative_slope, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_f16( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_f16( + xnn_operator_t leaky_relu_op, + const void* input, + void* output); + +enum xnn_status xnn_create_leaky_relu_nc_f32( + float negative_slope, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_f32( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_f32( + xnn_operator_t leaky_relu_op, + const float* input, + float* output); + +enum xnn_status xnn_run_leaky_relu_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + float negative_slope, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_leaky_relu_nc_qs8( + float negative_slope, + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_qs8( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_qs8( + xnn_operator_t leaky_relu_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_leaky_relu_nc_qu8( + float negative_slope, + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_qu8( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_qu8( + xnn_operator_t leaky_relu_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_f16( + xnn_operator_t 
max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_f16( + xnn_operator_t max_pooling_op, + const void* input, + void* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_f32( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_f32( + xnn_operator_t max_pooling_op, + const float* input, + float* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_s8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_s8( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_s8( + xnn_operator_t max_pooling_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_u8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_u8( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_u8( + xnn_operator_t max_pooling_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_maximum_nd_f16( + uint32_t flags, + xnn_operator_t* maximum_op_out); + +enum xnn_status xnn_reshape_maximum_nd_f16( + xnn_operator_t maximum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_maximum_nd_f16( + xnn_operator_t maximum_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_maximum_nd_f32( + uint32_t flags, + xnn_operator_t* 
maximum_op_out); + +enum xnn_status xnn_reshape_maximum_nd_f32( + xnn_operator_t maximum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_maximum_nd_f32( + xnn_operator_t maximum_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_maximum_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_mean_nd_f16( + uint32_t flags, + xnn_operator_t* mean_op_out); + +enum xnn_status xnn_reshape_mean_nd_f16( + xnn_operator_t mean_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_mean_nd_f16( + xnn_operator_t mean_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_mean_nd_f32( + uint32_t flags, + xnn_operator_t* mean_op_out); + +enum xnn_status xnn_reshape_mean_nd_f32( + xnn_operator_t mean_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_mean_nd_f32( + xnn_operator_t mean_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_minimum_nd_f16( + uint32_t flags, + xnn_operator_t* minimum_op_out); + +enum xnn_status xnn_reshape_minimum_nd_f16( + xnn_operator_t minimum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_minimum_nd_f16( + xnn_operator_t minimum_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_minimum_nd_f32( + uint32_t flags, + xnn_operator_t* minimum_op_out); + +enum xnn_status xnn_reshape_minimum_nd_f32( + xnn_operator_t minimum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_minimum_nd_f32( + xnn_operator_t minimum_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_minimum_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_multiply_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_f16( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_f16( + xnn_operator_t multiply_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_multiply_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_f32( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* 
input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_f32( + xnn_operator_t multiply_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_multiply_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_multiply_nd_qs8( + int8_t input1_zero_point, + float input1_scale, + int8_t input2_zero_point, + float input2_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_qs8( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_qs8( + xnn_operator_t multiply_op, + const int8_t* input1, + const int8_t* input2, + int8_t* output); + +enum xnn_status xnn_run_multiply_nd_qs8( + size_t num_input1_dims, + const size_t* input1_shape, + int8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + int8_t input2_zero_point, + float input2_scale, + const int8_t* input1, + const int8_t* input2, + int8_t* output, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_multiply_nd_qu8( + uint8_t input1_zero_point, + float input1_scale, + uint8_t input2_zero_point, + float input2_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_qu8( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_qu8( + xnn_operator_t multiply_op, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output); + +enum xnn_status xnn_run_multiply_nd_qu8( + size_t num_input1_dims, + const size_t* input1_shape, + uint8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + uint8_t input2_zero_point, + float input2_scale, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_negate_nc_f16( + uint32_t flags, + xnn_operator_t* negate_op_out); + +enum xnn_status xnn_reshape_negate_nc_f16( + xnn_operator_t negate_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_negate_nc_f16( + xnn_operator_t negate_op, + const void* input, + void* output); + +enum xnn_status xnn_create_negate_nc_f32( + uint32_t flags, + xnn_operator_t* negate_op_out); + +enum xnn_status xnn_reshape_negate_nc_f32( + xnn_operator_t negate_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_negate_nc_f32( + xnn_operator_t negate_op, + 
const float* input, + float* output); + +enum xnn_status xnn_run_negate_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_prelu_nc_f16( + size_t channels, + size_t input_stride, + size_t output_stride, + const void* negative_slope, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* prelu_op_out); + +enum xnn_status xnn_reshape_prelu_nc_f16( + xnn_operator_t prelu_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_prelu_nc_f16( + xnn_operator_t prelu_op, + const void* input, + void* output); + +enum xnn_status xnn_create_prelu_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + const float* negative_slope, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* prelu_op_out); + +enum xnn_status xnn_reshape_prelu_nc_f32( + xnn_operator_t prelu_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_prelu_nc_f32( + xnn_operator_t prelu_op, + const float* input, + float* output); + +enum xnn_status xnn_create_resize_bilinear2d_nchw_f32( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nchw_f32( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32( + xnn_operator_t resize_op, + const float* input, + float* output); + +enum xnn_status xnn_create_resize_bilinear2d_nchw_f16( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nchw_f16( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nchw_f16( + xnn_operator_t resize_op, + const void* input, + void* output); + +enum xnn_status xnn_create_resize_bilinear2d_nhwc_f16( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_f16( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f16( + xnn_operator_t resize_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_f32( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32( + xnn_operator_t resize_op, + void* workspace, + const float* input, + float* output); + 
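// --- Editorial sketch (not part of the diffed header) -------------------
// Operators whose reshape function reports workspace_size/workspace_alignment
// (global pooling, bilinear resize, attention) expect the caller to allocate
// the scratch buffer and hand it to setup. A minimal sketch with the f32
// bilinear resize declared above; the aligned_alloc-based allocation and the
// concrete shapes are assumptions made for illustration.
#include <stdlib.h>
#include <xnnpack.h>

static enum xnn_status resize_example(const float* in, float* out) {
  xnn_operator_t op = NULL;
  enum xnn_status st = xnn_create_resize_bilinear2d_nhwc_f32(
      /*output_height=*/64, /*output_width=*/64, /*flags=*/0, &op);
  if (st != xnn_status_success) return st;

  size_t ws_size = 0, ws_align = 0;
  st = xnn_reshape_resize_bilinear2d_nhwc_f32(
      op, /*batch_size=*/1, /*input_height=*/32, /*input_width=*/32,
      /*channels=*/8, /*input_pixel_stride=*/8, /*output_pixel_stride=*/8,
      &ws_size, &ws_align, /*threadpool=*/NULL);
  if (st == xnn_status_success) {
    // C11 aligned_alloc needs the size rounded up to a multiple of the
    // (power-of-two) alignment; a zero-sized workspace stays NULL.
    void* ws = ws_size == 0 ? NULL
        : aligned_alloc(ws_align, (ws_size + ws_align - 1) & ~(ws_align - 1));
    st = xnn_setup_resize_bilinear2d_nhwc_f32(op, ws, in, out);
    if (st == xnn_status_success) st = xnn_run_operator(op, /*threadpool=*/NULL);
    free(ws);
  }
  xnn_delete_operator(op);
  return st;
}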
+enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_s8( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8( + xnn_operator_t resize_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_u8( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8( + xnn_operator_t resize_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_rope_nthc_f16( + size_t max_tokens, + uint32_t flags, + xnn_operator_t* rope_op_out); + +enum xnn_status xnn_reshape_rope_nthc_f16( + xnn_operator_t rope_op, + size_t batch_size, + size_t tokens, + size_t heads, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_rope_nthc_f16( + xnn_operator_t rope_op, + const void* input, + const void* weights, + void* output); + +enum xnn_status xnn_create_rope_nthc_f32( + size_t max_tokens, + uint32_t flags, + xnn_operator_t* rope_op_out); + +enum xnn_status xnn_reshape_rope_nthc_f32( + xnn_operator_t rope_op, + size_t batch_size, + size_t tokens, + size_t heads, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_rope_nthc_f32( + xnn_operator_t rope_op, + const float* input, + const float* weights, + float* output); + +// N: batch size +// H: number of heads +// T: tokens (sequence length) +// C: channels (head dimension) +enum xnn_status xnn_create_scaled_dot_product_attention_nhtc_f16( + enum xnn_attention_logits_cap_type cap_type, + const void* cap_params, + uint32_t flags, + xnn_operator_t* attention_op_out); + +enum xnn_status xnn_reshape_scaled_dot_product_attention_nhtc_f16( + xnn_operator_t attention_op, + size_t batch_size, + size_t query_heads, + // Number of tokens in query. + size_t query_tokens, + size_t key_value_heads, + // Number of tokens in key/value. For self-attention, this is the same as query_tokens. + size_t key_value_tokens, + size_t query_key_channels, + size_t value_channels, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); +
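// --- Editorial sketch (not part of the diffed header) -------------------
// The attention operators use the same workspace-driven lifecycle as the
// pooling and resize operators above; the reshape parameters fix the buffer
// shapes documented in the comments around each setup function. A minimal
// self-attention sketch against the f32 variant declared just below; the
// xnn_attention_logits_cap_type_none enum member and the zero-filled sample
// buffers are assumptions made for illustration.
#include <stdlib.h>
#include <xnnpack.h>

static enum xnn_status attention_example(pthreadpool_t threadpool) {
  enum { N = 1, H = 2, Tq = 8, Tkv = 8, Cqk = 16, Cv = 16 };
  static float query[N * H * Tq * Cqk];  // [batch, query_heads, query_tokens, query_key_channels]
  static float key[N * H * Tkv * Cqk];   // [batch, kv_heads, kv_tokens, query_key_channels]
  static float value[N * H * Tkv * Cv];  // [batch, kv_heads, kv_tokens, value_channels]
  static float scale[Cqk];               // [query_key_channels]
  static float mask[Tq * Tkv];           // [query_tokens, key_value_tokens]
  static float output[N * H * Tq * Cv];  // [batch, query_heads, query_tokens, value_channels]

  xnn_operator_t op = NULL;
  enum xnn_status st = xnn_create_scaled_dot_product_attention_nhtc_f32(
      xnn_attention_logits_cap_type_none, /*cap_params=*/NULL, /*flags=*/0, &op);
  if (st != xnn_status_success) return st;

  size_t ws_size = 0, ws_align = 0;
  st = xnn_reshape_scaled_dot_product_attention_nhtc_f32(
      op, N, /*query_heads=*/H, Tq, /*key_value_heads=*/H, Tkv,
      /*query_key_channels=*/Cqk, /*value_channels=*/Cv,
      &ws_size, &ws_align, threadpool);
  if (st == xnn_status_success) {
    // Same caller-allocated workspace pattern as in the resize sketch.
    void* ws = ws_size == 0 ? NULL
        : aligned_alloc(ws_align, (ws_size + ws_align - 1) & ~(ws_align - 1));
    st = xnn_setup_scaled_dot_product_attention_nhtc_f32(
        op, ws, query, key, value, scale, mask, output);
    if (st == xnn_status_success) st = xnn_run_operator(op, threadpool);
    free(ws);
  }
  xnn_delete_operator(op);
  return st;
}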
+// Query is of dimension [batch_size, query_heads, query_tokens, query_key_channels]. +// Key is of dimension [batch_size, key_value_heads, key_value_tokens, query_key_channels]. +// Value is of dimension [batch_size, key_value_heads, key_value_tokens, value_channels]. +// Scale is of dimension [query_key_channels]. +// Mask is of dimension [query_tokens, key_value_tokens]. +// Output is of dimension [batch_size, query_heads, query_tokens, value_channels]. +enum xnn_status xnn_setup_scaled_dot_product_attention_nhtc_f16( + xnn_operator_t attention_op, + void* workspace, + const void* query, + const void* key, + const void* value, + const void* scale, + const void* mask, + void* output); + +// N: batch size +// H: number of heads +// T: tokens (sequence length) +// C: channels (head dimension) +enum xnn_status xnn_create_scaled_dot_product_attention_nhtc_f32( + enum xnn_attention_logits_cap_type cap_type, + const void* cap_params, + uint32_t flags, + xnn_operator_t* attention_op_out); + +enum xnn_status xnn_reshape_scaled_dot_product_attention_nhtc_f32( + xnn_operator_t attention_op, + size_t batch_size, + size_t query_heads, + // Number of tokens in query. + size_t query_tokens, + size_t key_value_heads, + // Number of tokens in key/value. For self-attention, this is the same as query_tokens. + size_t key_value_tokens, + size_t query_key_channels, + size_t value_channels, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +// Query is of dimension [batch_size, query_heads, query_tokens, query_key_channels]. +// Key is of dimension [batch_size, key_value_heads, key_value_tokens, query_key_channels]. +// Value is of dimension [batch_size, key_value_heads, key_value_tokens, value_channels]. +// Scale is of dimension [query_key_channels]. +// Mask is of dimension [query_tokens, key_value_tokens]. +// Output is of dimension [batch_size, query_heads, query_tokens, value_channels]. +enum xnn_status xnn_setup_scaled_dot_product_attention_nhtc_f32( + xnn_operator_t attention_op, + void* workspace, + const float* query, + const float* key, + const float* value, + const float* scale, + const float* mask, + float* output); + +enum xnn_status xnn_create_sigmoid_nc_f16( + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_f16( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_f16( + xnn_operator_t sigmoid_op, + const void* input, + void* output); + +enum xnn_status xnn_create_sigmoid_nc_f32( + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_f32( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_f32( + xnn_operator_t sigmoid_op, + const float* input, + float* output); + +enum xnn_status xnn_run_sigmoid_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_sigmoid_nc_qs8( + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_qs8( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_qs8( + xnn_operator_t sigmoid_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_sigmoid_nc_qu8( + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_qu8( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t
channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_qu8( + xnn_operator_t sigmoid_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_slice_nd_x16( + uint32_t flags, + xnn_operator_t* slice_op_out); + +enum xnn_status xnn_reshape_slice_nd_x16( + xnn_operator_t slice_op, + size_t num_dims, + const size_t* input_shape, + const size_t* offsets, + const size_t* sizes, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_slice_nd_x16( + xnn_operator_t slice_op, + const void* input, + void* output); + +enum xnn_status xnn_create_slice_nd_x32( + uint32_t flags, + xnn_operator_t* slice_op_out); + +enum xnn_status xnn_reshape_slice_nd_x32( + xnn_operator_t slice_op, + size_t num_dims, + const size_t* input_shape, + const size_t* offsets, + const size_t* sizes, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_slice_nd_x32( + xnn_operator_t slice_op, + const void* input, + void* output); + +enum xnn_status xnn_run_slice_nd_x32( + size_t num_dims, + const size_t* input_shape, + const size_t* offsets, + const size_t* sizes, + const void* input, + void* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_softmax_nc_f16( + uint32_t flags, + xnn_operator_t* softmax_op_out); + +enum xnn_status xnn_reshape_softmax_nc_f16( + xnn_operator_t softmax_op, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_softmax_nc_f16( + xnn_operator_t softmax_op, + const void* input, + void* output); + +enum xnn_status xnn_create_softmax_nc_f32( + uint32_t flags, + xnn_operator_t* softmax_op_out); + +enum xnn_status xnn_reshape_softmax_nc_f32( + xnn_operator_t softmax_op, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_softmax_nc_f32( + xnn_operator_t softmax_op, + const float* input, + float* output); + +enum xnn_status xnn_create_softmax_nc_qu8( + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint32_t flags, + xnn_operator_t* softmax_op_out); + +enum xnn_status xnn_reshape_softmax_nc_qu8( + xnn_operator_t softmax_op, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_softmax_nc_qu8( + xnn_operator_t softmax_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_space_to_depth_nhwc_x16( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* space_to_depth_op_out); + +enum xnn_status xnn_reshape_space_to_depth_nhwc_x16( + xnn_operator_t space_to_depth_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_space_to_depth_nhwc_x16( + xnn_operator_t space_to_depth_op, + const void* input, + void* output); + +enum xnn_status xnn_create_space_to_depth_nhwc_x32( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* space_to_depth_op_out); + +enum xnn_status xnn_reshape_space_to_depth_nhwc_x32( + xnn_operator_t space_to_depth_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_space_to_depth_nhwc_x32( + xnn_operator_t space_to_depth_op, + const void* input, + void* output); + +enum xnn_status xnn_create_square_nc_f16( + uint32_t flags, + xnn_operator_t* square_op_out); + +enum xnn_status xnn_reshape_square_nc_f16( + xnn_operator_t square_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_nc_f16( + xnn_operator_t square_op, + const void* input, + void* output); + +enum xnn_status xnn_create_square_nc_f32( + uint32_t flags, + xnn_operator_t* square_op_out); + +enum xnn_status xnn_reshape_square_nc_f32( + xnn_operator_t square_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_nc_f32( + xnn_operator_t square_op, + const float* input, + float* output); + +enum xnn_status xnn_run_square_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_square_root_nc_f16( + uint32_t flags, + xnn_operator_t* sqrt_op_out); + +enum xnn_status xnn_reshape_square_root_nc_f16( + xnn_operator_t sqrt_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_root_nc_f16( + xnn_operator_t sqrt_op, + const void* input, + void* output); + +enum xnn_status xnn_create_square_root_nc_f32( + uint32_t flags, + xnn_operator_t* sqrt_op_out); + +enum xnn_status xnn_reshape_square_root_nc_f32( + xnn_operator_t sqrt_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_root_nc_f32( + xnn_operator_t sqrt_op, + const float* input, + float* output); + +enum xnn_status xnn_run_square_root_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_reciprocal_square_root_nc_f32( + uint32_t flags, xnn_operator_t* sqrt_op_out); + +enum xnn_status xnn_reshape_reciprocal_square_root_nc_f32( + xnn_operator_t sqrt_op, size_t batch_size, size_t channels, + size_t input_stride, size_t output_stride, pthreadpool_t threadpool); + +enum xnn_status xnn_setup_reciprocal_square_root_nc_f32(xnn_operator_t sqrt_op, + const float* input, + float* output); + +enum xnn_status xnn_run_reciprocal_square_root_nc_f32( + size_t channels, size_t input_stride, size_t output_stride, + size_t batch_size, const float* input, float* output, uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_squared_difference_nd_f16( + uint32_t flags, + xnn_operator_t* squared_difference_op_out); + +enum xnn_status xnn_reshape_squared_difference_nd_f16( + xnn_operator_t squared_difference_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_squared_difference_nd_f16( + xnn_operator_t squared_difference_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_squared_difference_nd_f32( + uint32_t flags, + xnn_operator_t* squared_difference_op_out); + +enum xnn_status xnn_reshape_squared_difference_nd_f32( + xnn_operator_t squared_difference_op, + size_t 
num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_squared_difference_nd_f32( + xnn_operator_t squared_difference_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_squared_difference_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_subtract_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_f16( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_f16( + xnn_operator_t subtract_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_subtract_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_f32( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_f32( + xnn_operator_t subtract_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_subtract_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_subtract_nd_qs8( + int8_t input1_zero_point, + float input1_scale, + int8_t input2_zero_point, + float input2_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_qs8( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_qs8( + xnn_operator_t subtract_op, + const int8_t* input1, + const int8_t* input2, + int8_t* output); + +enum xnn_status xnn_run_subtract_nd_qs8( + size_t num_input1_dims, + const size_t* input1_shape, + int8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + int8_t input2_zero_point, + float input2_scale, + const int8_t* input1, + const int8_t* input2, + int8_t* output, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_subtract_nd_qu8( + uint8_t input1_zero_point, + float input1_scale, + uint8_t input2_zero_point, + float input2_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_qu8( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_qu8( + 
xnn_operator_t subtract_op, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output); + +enum xnn_status xnn_run_subtract_nd_qu8( + size_t num_input1_dims, + const size_t* input1_shape, + uint8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + uint8_t input2_zero_point, + float input2_scale, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_tanh_nc_f16( + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_f16( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_f16( + xnn_operator_t tanh_op, + const void* input, + void* output); + +enum xnn_status xnn_create_tanh_nc_f32( + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_f32( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_f32( + xnn_operator_t tanh_op, + const float* input, + float* output); + +enum xnn_status xnn_run_tanh_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_tanh_nc_qs8( + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_qs8( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_qs8( + xnn_operator_t tanh_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_tanh_nc_qu8( + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_qu8( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_qu8( + xnn_operator_t tanh_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_transpose_nd_x8( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x8( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_transpose_nd_x8( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x8( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_transpose_nd_x16( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x16( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_transpose_nd_x16( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x16( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_transpose_nd_x32( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x32( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_transpose_nd_x32( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x32( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_transpose_nd_x64( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x64( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_transpose_nd_x64( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x64( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_truncation_nc_f16( + uint32_t flags, + xnn_operator_t* truncation_op_out); + +enum xnn_status xnn_reshape_truncation_nc_f16( + xnn_operator_t truncation_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_truncation_nc_f16( + xnn_operator_t truncation_op, + const void* input, + void* output); + +enum xnn_status xnn_create_truncation_nc_f32( + uint32_t flags, + xnn_operator_t* truncation_op_out); + +enum xnn_status xnn_reshape_truncation_nc_f32( + xnn_operator_t truncation_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_truncation_nc_f32( + xnn_operator_t truncation_op, + const float* input, + float* output); + +enum xnn_status xnn_run_truncation_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_unpooling2d_nhwc_x32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + uint32_t flags, + xnn_operator_t* unpooling_op_out); + +enum xnn_status xnn_reshape_unpooling2d_nhwc_x32( + xnn_operator_t unpooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_unpooling2d_nhwc_x32( + xnn_operator_t unpooling_op, + const void* input, + const uint32_t* index, + void* output); + +enum xnn_status xnn_create_slice_nd_x8( + uint32_t flags, + xnn_operator_t* slice_op_out); + +enum xnn_status xnn_reshape_slice_nd_x8( + xnn_operator_t slice_op, + size_t num_dims, + const size_t* input_shape, + const 
size_t* offsets, + const size_t* sizes, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_slice_nd_x8( + xnn_operator_t slice_op, + const void* input, + void* output); + +enum xnn_status xnn_create_space_to_depth_nhwc_x8( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* space_to_depth_op_out); + +enum xnn_status xnn_reshape_space_to_depth_nhwc_x8( + xnn_operator_t space_to_depth_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_space_to_depth_nhwc_x8( + xnn_operator_t space_to_depth_op, + const void* input, + void* output); + +#ifdef __cplusplus +} // extern "C" +#endif
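Note on the declarations added in this hunk: every operator family here (square, square_root, reciprocal_square_root, squared_difference, subtract, tanh, transpose, truncation, unpooling, slice, space_to_depth) follows the same create -> reshape -> setup -> run lifecycle. The sketch below is illustrative only and not part of the diff; it walks an f32 square-root operator through that lifecycle using the xnn_create_square_root_nc_f32 / xnn_reshape_square_root_nc_f32 / xnn_setup_square_root_nc_f32 signatures declared above. It assumes the xnn_initialize, xnn_run_operator, xnn_delete_operator entry points and the xnn_status_success enum value from elsewhere in this header, and passes a NULL pthreadpool to run on the calling thread.

/* Minimal lifecycle sketch for the operators declared in this hunk.
 * Assumes xnn_initialize / xnn_run_operator / xnn_delete_operator from
 * elsewhere in xnnpack.h; a NULL threadpool runs single-threaded. */
#include <stddef.h>
#include <stdio.h>
#include <xnnpack.h>

int run_sqrt_example(void) {
  if (xnn_initialize(/*allocator=*/NULL) != xnn_status_success) return 1;

  enum { kBatchSize = 4, kChannels = 8 };
  float input[kBatchSize * kChannels];
  float output[kBatchSize * kChannels];
  for (size_t i = 0; i < kBatchSize * kChannels; i++) input[i] = (float) i;

  /* 1. Create the operator object. */
  xnn_operator_t sqrt_op = NULL;
  if (xnn_create_square_root_nc_f32(/*flags=*/0, &sqrt_op) != xnn_status_success)
    return 1;

  /* 2. Reshape: bind batch size, channels, and (contiguous) strides. */
  if (xnn_reshape_square_root_nc_f32(
          sqrt_op, kBatchSize, kChannels,
          /*input_stride=*/kChannels, /*output_stride=*/kChannels,
          /*threadpool=*/NULL) != xnn_status_success)
    return 1;

  /* 3. Setup: attach the input and output buffers. */
  if (xnn_setup_square_root_nc_f32(sqrt_op, input, output) != xnn_status_success)
    return 1;

  /* 4. Run the prepared operator. */
  if (xnn_run_operator(sqrt_op, /*threadpool=*/NULL) != xnn_status_success)
    return 1;

  printf("sqrt(%f) = %f\n", input[3], output[3]);
  xnn_delete_operator(sqrt_op);
  return 0;
}

For one-off invocations, the xnn_run_* declarations in the same hunk collapse the four steps into a single call with the same semantics, e.g. (same assumptions as above):

  /* One-shot equivalent using the xnn_run_* form declared above. */
  xnn_run_square_root_nc_f32(
      kChannels, /*input_stride=*/kChannels, /*output_stride=*/kChannels,
      kBatchSize, input, output, /*flags=*/0, /*threadpool=*/NULL);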