diff --git a/.gitattributes b/.gitattributes index 0b10ef2aa75c0640f04a077b4f6b8df5dea6fab5..cc09dcb315e618ca061005ac6cab8c4251372fbf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -142,3 +142,6 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/simd.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/halide.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a49ebeb21efcfc629f6282c3b21664840e7337d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee19b8e0b980d0895a7af50aa7c3244d133ce110a196485ab8cec5fa7b9767d4 +size 121452 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e2211e38b1ee8a071bceec78156e3a646d70f07 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8596ce3d305b9ea76fd93737e3fda25769b1901142db9efff0fde9757b03517 +size 262897 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e326f6006cfa958bb3f6b92bc77d5d59ee49fc8e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de158b207dec0ef6dd7cca5acc1db68fcc605b0046ed6c5ffcf0d9b8f34d3b82 +size 138985 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ae13108b6a38ad8b359e6666ff1318f593095e3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..dc384dccfb91fb6dcf38b5fc9148272c38d44701 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..733fc8d27968b2a245b81554b03297e3be206091 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02f8ba3fd51a84088eccaf74cc025824a721a4dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb7e59e175f2b51b5089030a60d99c13865c8bfc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67cedd2943e258946fba3ffe16a6381d81df2a9a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05416037695c63d94dc0be2b450bd696ca1f1ee8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2596968b3d2336bfb58eae6b7bbec8e410c4bec Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab211df50445bd0e0ae8d1e704ec137f1d39512 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ea534f93f2fb94e2719bc25fd65001de2d09f31 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f50fa4ac65efa3635e534ad0ba84bee210897bc3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2549dca51ec88e891ca03d2c3ad808d86334e9f7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93563fe19207e92a05c8992c031ff31808c0158d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60083539a8f660f702dfc111d70273d5b63d0c8a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67c923407c6cfc76576838be92e269d7527b18e6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f9ddd9680001b5937b43eee6cf210d56ca98861 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58c58ac4cd4ba85c625ddb8545282d060c0106c9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c925ffc945d520c500efe94a066f8d341ff7dab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a00dacc4d679317e150976db86f9c417f191c111 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ce7d9590488e2ff848fc99109754b9f53543de Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b17d76e12794b8407063c84d5dbb55b3aac25c99 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py @@ -0,0 +1 @@ +from . 
import mm, mm_common, mm_plus_mm, unpack_mixed_mm diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95eafd715ddf717597e38acac78c24bb1addc141 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..439ca3179d3eb0c7bd183c4b805eef127134005e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a83ec5ca0cfc13e7972e784f63bb57cb038ecb3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dfd0f99df0f6f8d7c1045d0e14a8726129e7439 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..860f94cffcb0b46f249f15f029d93e61198403b1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31abf219caacd182d3500f9e6f0284521514a883 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..741b3768a6fd15a6efbd1aaae1b5dbf61acac946 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93637ad8c893b8d84a23f46cf37219b4b618a4b6 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f28d584150079a847127f642f02a6a478c612f17 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9744724e3a503795e13cc2b0aaec8668017fafd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py new file mode 100644 index 0000000000000000000000000000000000000000..428ff06d4a052fa38fec17df8ad7a1d6591b507c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py @@ -0,0 +1,192 @@ +# mypy: allow-untyped-defs +import logging + +import torch + +from .. import ir, lowering as L +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ( + ceildiv as cdiv, + use_aten_gemm_kernels, + use_cutlass_template, + use_triton_template, +) +from ..virtualized import V +from .mm import _is_static_problem +from .mm_common import addmm_epilogue, mm_args, mm_configs, mm_options + + +log = logging.getLogger(__name__) +aten = torch.ops.aten + + +def bmm_grid(b, m, n, meta): + return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), b, 1) + + +bmm_template = TritonTemplate( + name="bmm", + grid=bmm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", -2)}} + N = {{size("B", -1)}} + K = {{size("A", -1)}} + + stride_aq = {{stride("A", 0)}} + stride_am = {{stride("A", 1)}} + stride_ak = {{stride("A", 2)}} + + stride_bq = {{stride("B", 0)}} + stride_bk = {{stride("B", 1)}} + stride_bn = {{stride("B", 2)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + + rk = tl.arange(0, BLOCK_K) + + idx_q = tl.program_id(1) # batch dimension for BMM + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak + idx_q*stride_aq) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn + idx_q*stride_bq) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if 
EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_q = tl.program_id(1) # batch dimension for BMM + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_q", "idx_m", "idx_n"), "acc", "mask")}} +""", +) + +aten_bmm = ExternKernelChoice(torch.bmm, "at::bmm_out") +aten_baddbmm = ExternKernelChoice(torch.baddbmm, "at::baddbmm_out") + + +@L.register_lowering(aten.bmm) +def tuned_bmm(mat1, mat2, *, layout=None): + if all(x.get_device().type == "cpu" for x in [mat1, mat2]): + # decompose to small ops when memory bound + if mat1.get_size()[1] == 1 or mat2.get_size()[2] == 1: + mat1 = L.unsqueeze(mat1, -1) + mat2 = L.unsqueeze(mat2, 1) + return L.sum_(L.mul(mat1, mat2), axis=2) + + def is_valid_to_require_contiguous(t): + if not ir.is_storage_and_layout(t): + return True + _, layout = ir.as_storage_and_layout(t, freeze=False) + return isinstance(layout, ir.FlexibleLayout) + + def is_preferred_layout_as_bmm_input(sizes, strides): + # contiguous on one of the last two dims + return ( + strides[-1] == 1 and (sizes[-2] == 1 or strides[-2] >= sizes[-1]) + ) or (strides[-2] == 1 and (sizes[-1] == 1 or strides[-1] >= sizes[-2])) + + # Make the input of bmm contiguous + # if it is not contiguous on either of the last two dims, + # because bmm cpu implementation would do contiguous() if not. + # This is to avoid additional copies in bmm. + def may_require_contiguous(t, meta_t): + sizes = meta_t.meta["val"].size() + strides = meta_t.meta["val"].stride() + if not is_preferred_layout_as_bmm_input(sizes, strides): + t = ir.ExternKernel.require_contiguous(t) + return t + + if is_valid_to_require_contiguous(mat1): + meta_mat1 = V.graph.current_node.args[0] + mat1 = may_require_contiguous(mat1, meta_mat1) + if is_valid_to_require_contiguous(mat2): + meta_mat2 = V.graph.current_node.args[1] + mat2 = may_require_contiguous(mat2, meta_mat2) + + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + + # options to tune from + choices = [aten_bmm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else [] + if use_triton_template(layout): + for config in mm_configs(m, n, k): + bmm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout) + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + from ..codegen.cuda.gemm_template import CUTLASS3xGemmTemplate + + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2]) + + if len(choices) == 0: + log.warning("No choices for GEMM, using ATen backend as fallback") + choices.append(aten_bmm.bind((mat1, mat2), layout)) + + return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout) + + +# Don't register this since it is slower than decomposing it +# @L.register_lowering(aten.baddbmm) +def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): + m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout) + + # options to tune from + choices = ( + [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)] + if 
use_aten_gemm_kernels() + else [] + ) + if use_triton_template(layout): + for config in mm_configs(m, n, k): + bmm_template.maybe_append_choice( + choices, + input_nodes=(inp, mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + prefix_args=1, + epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta), + ) + + return autotune_select_algorithm("baddbmm", choices, [inp, mat1, mat2], layout) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..b69143fe0301545c53557886ed5701674e57565a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py @@ -0,0 +1,679 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +from __future__ import annotations + +import functools +import logging +from typing import cast, List, Optional, Sequence, Tuple, TYPE_CHECKING, TypedDict + +import torch + +from .. import config, ir +from ..lowering import ( + add_layout_constraint, + constrain_to_fx_strides, + lowerings as L, + register_lowering, +) +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ( + ceildiv, + is_ones, + is_zeros, + pad_listlike, + sympy_product, + use_triton_template, +) +from ..virtualized import V +from .mm_common import filtered_configs + + +if TYPE_CHECKING: + from ..ir import TensorBox + +log = logging.getLogger(__name__) + + +aten = torch.ops.aten + + +def conv2d_grid(n, c, h, w, meta): + return ( + ceildiv(n * h * w, meta["BLOCK_M"]), + ceildiv(c, meta["BLOCK_N"]), + meta["GROUPS"], + ) + + +def conv3d_grid(n, c, d, h, w, meta): + return ( + ceildiv(n * d * h * w, meta["BLOCK_M"]), + ceildiv(c, meta["BLOCK_N"]), + meta["GROUPS"], + ) + + +# List of dictionaries to store the kernel configs. 
Configs that evaluate to true +# will be utilised on the target platform +kernel_configs = [ + # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps" + {"config": (64, 256, 16, 2, 4), "cond": True}, + {"config": (256, 64, 16, 2, 4), "cond": True}, + {"config": (1024, 16, 16, 1, 8), "cond": True}, + {"config": (128, 128, 32, 2, 8), "cond": True}, + {"config": (64, 64, 32, 2, 4), "cond": True}, + {"config": (64, 256, 32, 2, 8), "cond": True}, + {"config": (256, 64, 32, 2, 8), "cond": True}, +] + +# Create filtered list of configs based on conv +platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in kernel_configs + if config["cond"] +) + +# On ROCm convert num_stages to 1 as pipelining provides no benefit +if torch.version.hip: + platform_configs = tuple( + (config[0], config[1], config[2], 1, config[4]) for config in platform_configs + ) + +conv_configs = functools.partial( + filtered_configs, + configs=platform_configs, +) + +LOOP_BODY_2D = """ + idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H + idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W + idx_x_c = tl.arange(0, BLOCK_K) + k + + x_ptrs = x_base + ( + (idx_x_h * stride_xh)[:, None] + + (idx_x_w * stride_xw)[:, None] + + (idx_x_c * stride_xc)[None, :] + ) + mask_x = ( + (idx_n < BATCH)[:, None] + & (idx_x_h >= 0)[:, None] + & (idx_x_h < IN_H)[:, None] + & (idx_x_w >= 0)[:, None] + & (idx_x_w < IN_W)[:, None] + & (idx_x_c < GROUP_IN_C)[None, :] + ) + matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) + + w_ptrs = w_base + ( + (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) + ) + mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) + matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) +""" + +""" +This is a relatively simple conv implementation that can likely be +improved. 
Many alternate conv versions can be found here: +https://github.com/pytorch/torchdynamo/pull/971 +""" +conv2d_template = TritonTemplate( + name="convolution2d", + grid=conv2d_grid, + source=r""" +{{def_kernel("X", "W")}} + # Tensor dimensions + BATCH = {{size("X", 0)}} + IN_C = {{size("X", 1)}} + IN_H = {{size("X", 2)}} + IN_W = {{size("X", 3)}} + OUT_C = {{size(None, 1)}} + OUT_H = {{size(None, 2)}} + OUT_W = {{size(None, 3)}} + + # Strides: + stride_xn = {{stride("X", 0)}} + stride_xc = {{stride("X", 1)}} + stride_xh = {{stride("X", 2)}} + stride_xw = {{stride("X", 3)}} + stride_wc_out = {{stride("W", 0)}} + stride_wc_in = {{stride("W", 1)}} + stride_wh = {{stride("W", 2)}} + stride_ww = {{stride("W", 3)}} + + nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + idx_y_w = nhw % OUT_W + nh = nhw // OUT_W + idx_y_h = nh % OUT_H + idx_n = nh // OUT_H + idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + +{% if GROUPS == 1 %} + group = 0 + GROUP_IN_C = IN_C + GROUP_OUT_C = OUT_C +{% else %} + group = tl.program_id(2) + GROUP_IN_C = IN_C // GROUPS + GROUP_OUT_C = OUT_C // GROUPS +{% endif %} + + x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] + w_base = ( + W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + +{% if UNROLL %} +{% for i in range(KERNEL_H) %} +{% for j in range(KERNEL_W) %} + i = {{i}} + j = {{j}} + for k in range(0, GROUP_IN_C, BLOCK_K): + """ + + LOOP_BODY_2D + + """ +{% endfor %} +{% endfor %} +{% else %} + # Could be simplified, but slightly slower: + # for i in range(KERNEL_H): + # for j in range(KERNEL_W): + # for k in range(0, GROUP_IN_C, BLOCK_K): + BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K + for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): + k = (ijk % BLOCK_K_COUNT) * BLOCK_K + ij = ijk // BLOCK_K_COUNT + i = ij // KERNEL_W + j = ij % KERNEL_W + """ + + LOOP_BODY_2D + + """ +{% endif %} + + mask = ( + (idx_n < BATCH)[:, None] + & (idx_y_h < OUT_H)[:, None] + & (idx_y_w < OUT_W)[:, None] + & (idx_y_c < GROUP_OUT_C)[None, :] + ) + idx_n = idx_n[:, None] + idx_c = idx_y_c[None, :] + group * GROUP_OUT_C + idx_h = idx_y_h[:, None] + idx_w = idx_y_w[:, None] + + # inductor generates a suffix + {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}} +""", +) + +LOOP_BODY_3D = """ + idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D + idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H + idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W + idx_x_c = tl.arange(0, BLOCK_K) + k + + x_ptrs = x_base + ( + (idx_x_d * stride_xd)[:, None] + + (idx_x_h * stride_xh)[:, None] + + (idx_x_w * stride_xw)[:, None] + + (idx_x_c * stride_xc)[None, :] + ) + mask_x = ( + (idx_n < BATCH)[:, None] + & (idx_x_d >= 0)[:, None] + & (idx_x_d < IN_D)[:, None] + & (idx_x_h >= 0)[:, None] + & (idx_x_h < IN_H)[:, None] + & (idx_x_w >= 0)[:, None] + & (idx_x_w < IN_W)[:, None] + & (idx_x_c < GROUP_IN_C)[None, :] + ) + matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) + + w_ptrs = w_base + ( + (idx_x_c * stride_wc_in)[:, None] + + (d * stride_wd) + (i * stride_wh) + (j * stride_ww) + ) + mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) + matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) +""" + +conv3d_template = TritonTemplate( + name="convolution3d", + grid=conv3d_grid, + source=r""" +{{def_kernel("X", "W")}} + # Tensor dimensions + BATCH = {{size("X", 0)}} + IN_C = {{size("X", 
1)}} + IN_D = {{size("X", 2)}} + IN_H = {{size("X", 3)}} + IN_W = {{size("X", 4)}} + OUT_C = {{size(None, 1)}} + OUT_D = {{size(None, 2)}} + OUT_H = {{size(None, 3)}} + OUT_W = {{size(None, 4)}} + + # Strides: + stride_xn = {{stride("X", 0)}} + stride_xc = {{stride("X", 1)}} + stride_xd = {{stride("X", 2)}} + stride_xh = {{stride("X", 3)}} + stride_xw = {{stride("X", 4)}} + stride_wc_out = {{stride("W", 0)}} + stride_wc_in = {{stride("W", 1)}} + stride_wd = {{stride("W", 2)}} + stride_wh = {{stride("W", 3)}} + stride_ww = {{stride("W", 4)}} + + ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + idx_y_w = ndhw % OUT_W + ndh = ndhw // OUT_W + idx_y_h = ndh % OUT_H + nd = ndh // OUT_H + idx_y_d = nd % OUT_D + idx_n = nd // OUT_D + idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + +{% if GROUPS == 1 %} + group = 0 + GROUP_IN_C = IN_C + GROUP_OUT_C = OUT_C +{% else %} + group = tl.program_id(2) + GROUP_IN_C = IN_C // GROUPS + GROUP_OUT_C = OUT_C // GROUPS +{% endif %} + + x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] + w_base = ( + W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + +{% if UNROLL %} +{% for d in range(KERNEL_D) %} +{% for i in range(KERNEL_H) %} +{% for j in range(KERNEL_W) %} + d = {{d}} + i = {{i}} + j = {{j}} + for k in range(0, GROUP_IN_C, BLOCK_K): + """ + + LOOP_BODY_3D + + """ +{% endfor %} +{% endfor %} +{% endfor %} +{% else %} + # Could be simplified, but slightly slower: + # for d in range(KERNEL_D): + # for i in range(KERNEL_H): + # for j in range(KERNEL_W): + # for k in range(0, GROUP_IN_C, BLOCK_K): + BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K + for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT): + k = (dijk % BLOCK_K_COUNT) * BLOCK_K + dij = dijk // BLOCK_K_COUNT + j = dij % KERNEL_W + di = dij // KERNEL_W + i = di % KERNEL_H + d = di // KERNEL_H + """ + + LOOP_BODY_3D + + """ +{% endif %} + + mask = ( + (idx_n < BATCH)[:, None] + & (idx_y_d < OUT_D)[:, None] + & (idx_y_h < OUT_H)[:, None] + & (idx_y_w < OUT_W)[:, None] + & (idx_y_c < GROUP_OUT_C)[None, :] + ) + idx_n = idx_n[:, None] + idx_c = idx_y_c[None, :] + group * GROUP_OUT_C + idx_d = idx_y_d[:, None] + idx_h = idx_y_h[:, None] + idx_w = idx_y_w[:, None] + + # inductor generates a suffix + {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}} +""", +) + +aten_convolution = ExternKernelChoice( + torch.convolution, + "at::convolution", + has_out_variant=False, + op_overload=aten.convolution.default, +) + + +def conv1x1_via_mm(x, w, *, out): + w = torch.squeeze(torch.squeeze(w, -1), -1) + return torch.matmul( + x.permute(0, 2, 3, 1), w.permute(1, 0), out=out.permute(0, 2, 3, 1) + ) + + +aten_conv1x1_via_mm = ExternKernelChoice(conv1x1_via_mm, None) + + +class ConvLayoutParams(TypedDict): + stride: tuple[int, ...] + padding: tuple[int, ...] + dilation: tuple[int, ...] + transposed: bool + output_padding: tuple[int, ...] 
+ groups: int + + +def conv_layout( + x: TensorBox, + weight: TensorBox, + bias: Optional[TensorBox], + stride: Sequence[int], + padding: tuple[int, ...], + dilation: tuple[int, ...], + transposed: bool, + output_padding: tuple[int, ...], + groups: int, +) -> ir.Layout: + """Determine output layout for a convolution""" + with V.graph.fake_mode: + output = torch.ops.aten.convolution( + ir.ir_node_to_tensor(x, guard_shape=True), + ir.ir_node_to_tensor(weight, guard_shape=True), + ir.ir_node_to_tensor(bias, guard_shape=True), + V.graph.sizevars.size_hints(stride), # type: ignore[arg-type] + V.graph.sizevars.size_hints(padding), # type: ignore[arg-type] + V.graph.sizevars.size_hints(dilation), # type: ignore[arg-type] + transposed, + V.graph.sizevars.size_hints(output_padding), # type: ignore[arg-type] + groups, + ) + sizes = ir.convert_shape_to_inductor(output.size()) + stride = ir.convert_shape_to_inductor(output.stride()) # type: ignore[assignment] + + return ir.FixedLayout( + x.get_device(), + x.get_dtype(), + sizes, + stride, + ) + + +def channels_last_order(rank): + order = list(reversed(range(rank))) + order.insert(1, order.pop(-1)) + return order + + +def convert_1x1_conv_to_mm(x, weight, bias): + # special case for 1x1 convolution, which is actually just a matmul + rank = len(weight.get_size()) + for _ in range(rank - 2): + weight = L[aten.squeeze](weight, dim=-1) + weight = L[aten.permute](weight, [1, 0]) + + x = ir.ExternKernel.require_stride_order(x, channels_last_order(rank)) + x_permute = list(range(rank)) + x_permute.append(x_permute.pop(1)) + x = L[aten.permute](x, x_permute) + *sizes, in_chan = x.get_size() + x = L[aten.reshape](x, [sympy_product(sizes), in_chan]) + if bias is None: + result = L[aten.mm](x, weight) + else: + result = L[aten.addmm](bias, x, weight) + result = L[aten.reshape](result, [*sizes, -1]) + result_permute = list(range(rank)) + result_permute.insert(1, result_permute.pop(-1)) + return L[aten.permute](result, result_permute) + + +@register_lowering(aten.convolution) +def convolution( + x: TensorBox, + weight: TensorBox, + bias: TensorBox, + stride: List[int], + padding: List[int], + dilation: List[int], + transposed: bool, + output_padding: List[int], + groups: int, +): + stride = tuple(stride) + padding = tuple(padding) + dilation = tuple(dilation) + output_padding = tuple(output_padding) + if not isinstance(groups, int): + groups = V.graph.sizevars.evaluate_static_shape(groups) + assert isinstance(groups, int) + + # Need to use size hints for the triton template since the template does not + # work with a dynamic shape. + # + # No need to evaluate_static_shape for dilation and output_padding + # since the template is only used when dilation is 1 and output_padding + # is 0.
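+ # For example (illustrative values): a conv whose symbolic stride element has + # hint 2 is specialized to the concrete tuple (2, 2) below, so the template + # sees plain Python ints rather than SymInts.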
+ stride = tuple(V.graph.sizevars.evaluate_static_shapes(stride)) + padding = tuple(V.graph.sizevars.evaluate_static_shapes(padding)) + + kwargs: ConvLayoutParams = { + "stride": stride, + "padding": padding, + "dilation": dilation, + "transposed": transposed, + "output_padding": output_padding, + "groups": groups, + } + + if len(x.get_size()) == len(weight.get_size()) - 1: + # add batch dimension to simplify rest of function + return L[aten.squeeze]( + convolution(L[aten.expand](x, [1, *x.get_size()]), weight, bias, **kwargs), + dim=0, + ) + + out_chan, in_chan, *kernel_shape = V.graph.sizevars.evaluate_static_shapes( + weight.get_size() + ) + ndim = len(kernel_shape) + stride = pad_listlike(stride, ndim) + padding = pad_listlike(padding, ndim) + dilation = pad_listlike(dilation, ndim) + output_padding = pad_listlike(output_padding, ndim) + + def channels_last_conv(): + if V.graph.layout_opt and ndim == 2: + return True + + layout = conv_layout(x, weight, None, **kwargs) + req_stride_order = ir.get_stride_order( + V.graph.sizevars.size_hints(layout.stride) + ) + return req_stride_order == ir.NHWC_STRIDE_ORDER + + autotuning_gemm = config.max_autotune or config.max_autotune_gemm + + if ( + (config.conv_1x1_as_mm or (autotuning_gemm and channels_last_conv())) + and is_ones(kernel_shape) + and is_ones(stride) + and is_zeros(padding) + and is_ones(dilation) + and not transposed + and is_zeros(output_padding) + and groups == 1 + and V.graph.sizevars.statically_known_gt(sympy_product(x.get_size()), 0) + ): + return convert_1x1_conv_to_mm(x, weight, bias) + + if bias is not None and ir.get_device_type(x) != "cpu": + # peel off the bias, cudnn is slower with it + result = convolution(x, weight, None, **kwargs) + return L[aten.add]( + result, L[aten.view](bias, [result.get_size()[1]] + ndim * [1]) + ) + + x.realize() + weight.realize() + + # ndim can be 1 for convolution in models such as demucs + # TODO: check if it's beneficial to convert Conv1d to Conv2d and then + # apply channels last. + if V.graph.layout_opt and ndim == 2: + V.graph.num_channels_last_conv += 1 + x = ir.ExternKernel.require_channels_last(x) + # TODO maybe we can convert weights to channels last just once before + # running the model. + weight = ir.ExternKernel.require_channels_last(weight) + layout = conv_layout(x, weight, None, **kwargs) + else: + layout = conv_layout(x, weight, None, **kwargs) + req_stride_order = ir.get_stride_order( + V.graph.sizevars.size_hints(layout.stride) + ) + x = ir.ExternKernel.require_stride_order(x, req_stride_order) + weight = ir.ExternKernel.require_stride_order(weight, req_stride_order) + + ordered_kwargs_for_cpp_kernel = [ + "stride", + "padding", + "dilation", + "transposed", + "output_padding", + "groups", + ] + if bias is None: + args = [x, weight] + kwargs["bias"] = None # type: ignore[typeddict-unknown-key] + ordered_kwargs_for_cpp_kernel.insert(0, "bias") + else: + args = [x, weight, bias] + bias.realize() + bias.freeze_layout() + V.graph.sizevars.evaluate_static_shapes(bias.get_size()) + + choices = [] + if torch._inductor.utils._use_conv_autotune_backend("ATEN"): + choices = [ + aten_convolution.bind( + args, + layout, + ordered_kwargs_for_cpp_kernel, + **kwargs, + ) + ] + + if ( + torch._inductor.utils._use_conv_autotune_backend("TRITON") + and use_triton_template(layout) + # templates only support these: + and is_ones(dilation) + and not transposed + and is_zeros(output_padding) + # there are some odd models where this check fails (e.g. 
shufflenet_v2_x1_0) + and V.graph.sizevars.statically_known_equals(in_chan, x.get_size()[1]) # type: ignore[arg-type] + ): + if ( + is_ones(kernel_shape) + and is_ones(stride) + and is_zeros(padding) + and groups == 1 + ): + choices.append(aten_conv1x1_via_mm.bind(args, layout)) + + for cfg in conv_configs( + sympy_product([x.get_size()[0], *x.get_size()[2:]]), + out_chan, + in_chan, + ): + if ndim == 2: + conv2d_template.maybe_append_choice( + choices, + input_nodes=(x, weight), + layout=layout, + KERNEL_H=kernel_shape[0], + KERNEL_W=kernel_shape[1], + STRIDE_H=stride[0], + STRIDE_W=stride[1], + PADDING_H=padding[0], + PADDING_W=padding[1], + GROUPS=groups, + # TODO(jansel): try unroll for bigger kernels once fixed: + # https://github.com/openai/triton/issues/1254 + UNROLL=is_ones(kernel_shape), + ALLOW_TF32=torch.backends.cudnn.allow_tf32, + num_stages=cfg.num_stages, + num_warps=cfg.num_warps, + **cfg.kwargs, + ) + elif ndim == 3: + conv3d_template.maybe_append_choice( + choices, + input_nodes=(x, weight), + layout=layout, + KERNEL_D=kernel_shape[0], + KERNEL_H=kernel_shape[1], + KERNEL_W=kernel_shape[2], + STRIDE_D=stride[0], + STRIDE_H=stride[1], + STRIDE_W=stride[2], + PADDING_D=padding[0], + PADDING_H=padding[1], + PADDING_W=padding[2], + GROUPS=groups, + # TODO(jansel): try unroll for bigger kernels once fixed: + # https://github.com/openai/triton/issues/1254 + UNROLL=is_ones(kernel_shape), + ALLOW_TF32=torch.backends.cudnn.allow_tf32, + num_stages=cfg.num_stages, + num_warps=cfg.num_warps, + **cfg.kwargs, + ) + + return autotune_select_algorithm("convolution", choices, args, layout) + + +@register_lowering(aten._convolution) +def _convolution( + x, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + benchmark, + deterministic, + cudnn_enabled, + allow_tf32, +): + return convolution( + x, weight, bias, stride, padding, dilation, transposed, output_padding, groups + ) + + +def constrain_conv_to_fx_strides(fx_node, *args, **kwargs): + assert fx_node.target == torch.ops.aten.convolution.default + if V.graph.layout_opt: + return args, kwargs + else: + return constrain_to_fx_strides(fx_node, *args, **kwargs) + + +add_layout_constraint(aten.convolution, constrain_conv_to_fx_strides) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..567c4c5fefbf8688b8f5e4b1eac247535c8edb32 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py @@ -0,0 +1,1843 @@ +# mypy: allow-untyped-defs +""" Triton Implementation of the flex_attention Kernel""" + +import logging +import math +from typing import Any, List, Optional, Sequence, Tuple + +import sympy + +import torch +from torch._inductor.virtualized import V +from torch.utils._pytree import tree_map + +from .. 
import config +from ..ir import ( + ComputedBuffer, + ExternKernel, + FixedLayout, + FlexibleLayout, + get_stride_order, + InputBuffer, + IRNode, + StorageBox, + stride_order2fill_order, + Subgraph, + TensorBox, +) +from ..lowering import empty, empty_strided, lowerings, register_lowering +from ..select_algorithm import autotune_select_algorithm, realize_inputs, TritonTemplate + + +log = logging.getLogger(__name__) +aten = torch.ops.aten +Expr = sympy.Expr + + +def construct_strides( + sizes: Sequence[int], + fill_order: Sequence[int], +) -> Sequence[int]: + """From a list of sizes and a fill order, construct the strides of the permuted tensor.""" + # Initialize strides + assert len(sizes) == len( + fill_order + ), "Length of sizes must match the length of the fill order" + strides = [0] * len(sizes) + + # Start with stride 1 for the innermost dimension + current_stride = 1 + + # Iterate through the fill order populating strides + for dim in fill_order: + strides[dim] = current_stride + current_stride *= sizes[dim] + + return strides + + +def flex_attention_grid(batch_size, q_heads, num_queries, d_model, meta): + """How is this kernel parallelized? + We create a grid of (ceil_div(n_queries, query_block_size), batch_size * num_heads, 1) + Each block is responsible for iterating over blocks of keys and values, calculating + the final attention output. + """ + import triton + + return (triton.cdiv(num_queries, meta["BLOCK_M"]), batch_size * q_heads, 1) + + +def create_placeholder( + name: str, dtype: torch.dtype, device: torch.device +) -> TensorBox: + """Creates a placeholder input buffer for producing subgraph_output.""" + input_buffer = InputBuffer(name, FixedLayout(device, dtype, [], [])) + return TensorBox.create(input_buffer) + + +def maybe_realize(args: List[Optional[IRNode]]): + """Accepts a list of optional IRNodes and returns a list of realized IRNodes""" + return tree_map(lambda x: realize_inputs(x) if x is not None else None, args) + + +def get_float32_precision(): + if torch.get_float32_matmul_precision() == "highest" or torch.version.hip: + return "'ieee'" + else: + return "'tf32'" + + +def build_subgraph_buffer( + args: List[TensorBox], + subgraph: Subgraph, +): + """This function's goal is to take in the required args and produce the subgraph buffer. + The subgraph buffer is a ComputedBuffer that will be inlined into the triton template + + Args: + args: The args that are passed into the subgraph. Contains both fixed and lifted inputs. + subgraph: The Subgraph ir for which to produce the output node + """ + cnt = 0 + env = {} + for node in subgraph.graph_module.graph.nodes: + # There are two classes of placeholder inputs that we need + # to handle differently. For the first n_scalar_inps inputs + # we expect that these placeholders were generated by the make_fx call + # in the flex attention HOP. So we need to create a new placeholder + # TensorBox for each of these inputs. For the rest of the inputs we + # expect that these are lifted inputs that fill up the '*other_buffers' + # tuple and already have corresponding TensorBoxes passed in as args.
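+ # Illustrative example (hypothetical subgraph): for a score_mod traced as + # (score, b, h, m, n) plus one captured bias tensor, the first five + # placeholders bind to the fresh scalar TensorBoxes (see create_placeholder) + # at the front of args, and the sixth binds to the lifted bias TensorBox + # that follows them.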
+ if node.op == "placeholder": + env[node] = args[cnt] + cnt += 1 + elif node.op == "call_function": + # For call_function we use the default lowerings and pass in the + # already created TensorBoxes as args + + args, kwargs = tree_map( + lambda x: env[x] if x in env else x, (node.args, node.kwargs) + ) + env[node] = lowerings[node.target](*args, **kwargs) + elif node.op == "output": + + def convert_output_node_to_buffer(output): + if output is None: + return None + output_node = output + output_buffer = env[output_node] + assert isinstance(output_buffer, TensorBox), ( + "The output node for flex attention's subgraph must be a TensorBox, but got: ", + type(output_buffer), + ) + assert isinstance(output_buffer.data, StorageBox), ( + "The output node for the flex attention subgraph must be a StorageBox, but got: ", + type(output_buffer), + ) + subgraph_buffer = ComputedBuffer( + name=None, + layout=FlexibleLayout( + device=output_buffer.data.get_device(), + dtype=output_buffer.data.get_dtype(), + size=output_buffer.data.get_size(), + ), + data=output_buffer.data.data, # type: ignore[arg-type] + ) + return subgraph_buffer + + # node.args[0] is either a single element or a list of elements + # representing all outputs of the function. + return tree_map(convert_output_node_to_buffer, node.args[0]) + + raise ValueError("FlexAttention was passed a subgraph with no output node!") + + +# Inner Triton functions shared by flex_attention & split-k decoding kernels. +compute_next_offset_func = r""" +@triton.jit +def get_offset_for_next_block(loop_iter, col_indices, total_blocks, SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK): + cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE + cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last") + next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks) + needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0 + jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK + + offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK + return offset +""" + +compute_flex_attention = r""" +{{def_kernel("Q", "K", "V", "LSE", "KV_NUM_BLKS", "KV_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX")}} + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # M: Number of queries, N: Number of keys/values, D: Model dimension + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # + # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad + # + # (Modifiable) Performance tuning options + # BLOCK_M: The thread block size across the seqlen dim of Q. + # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block. 
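+ # As a concrete illustration (sizes are hypothetical): Q_LEN=1024, BLOCK_M=128, + # Z=4, HQ=8 gives a launch grid of (cdiv(1024, 128), 4 * 8, 1) = (8, 32, 1); + # program_id(0) selects the query block and program_id(1) the (batch, head) pair.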
+ + # The below are kernel options that can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has + # about 20% more numerical error, but slightly faster. + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + + tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0) + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qk = {{stride("Q")}} + stride_kz, stride_kh, stride_kn, stride_kk = {{stride("K")}} + stride_vz, stride_vh, stride_vn, stride_vk = {{stride("V")}} + + Z = {{size("Q", 0)}} + HQ = {{size("Q", 1)}} + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + MATMUL_PRECISION = Q.dtype.element_ty + + q_start = tl.program_id(0) + off_z = tl.program_id(1) // HQ + off_hq = tl.program_id(1) % HQ + off_hkv = off_hq // GQA_SHARED_HEADS + off_g = off_hq % GQA_SHARED_HEADS + + q_offset = off_z * stride_qz + off_hq * stride_qh + k_offset = off_z * stride_kz + off_hkv * stride_kh + v_offset = off_z * stride_vz + off_hkv * stride_vh + + Q = Q + q_offset + K = K + k_offset + V = V + v_offset + + SPARSE_Z = {{size("KV_NUM_BLKS", 0)}} + SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}} + + sparse_idx_z = off_z % SPARSE_Z + sparse_idx_hq = off_hq % SPARSE_HQ + + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M) + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + + SPARSE_Q_BLOCK_CNT: tl.constexpr = tl.cdiv(Q_LEN, SPARSE_Q_BLOCK_SIZE) + SPARSE_KV_BLOCK_CNT: tl.constexpr = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE) + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M) + + # KV_IDX and KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq + sparse_kv_num_blks_offset = sparse_hz_offset * SPARSE_Q_BLOCK_CNT + q_start // SPARSE_Q_MULTIPLE + sparse_kv_idx_offset = sparse_hz_offset * SPARSE_Q_BLOCK_CNT * SPARSE_KV_BLOCK_CNT + (q_start // SPARSE_Q_MULTIPLE) * SPARSE_KV_BLOCK_CNT # noqa: B950 + + Q_block_ptr = tl.make_block_ptr( + base=Q, + shape=(Q_LEN, QK_HEAD_DIM), + strides=(stride_qm, stride_qk), + offsets=(q_start * BLOCK_M, 0), + block_shape=(BLOCK_M, QK_HEAD_DIM), + order=(1, 0) + ) + + # load q: it stays in SRAM throughout the inner loop. + if IS_DIVISIBLE: + q = tl.load(Q_block_ptr) + else: + # boundary check is not free, so we only do it when necessary. 
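+ # Only dim 0 (the query seqlen) can run past the end here: the block shape's + # second dim is QK_HEAD_DIM itself, so it never needs checking.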
+ q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option = "zero") + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We don't know anything "special" about these blocks, so we need to apply + # both score_mod and mask_mod to it + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_z, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_z, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + + # [Note] Handle fully masked out rows: + # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf. 
+ # We set Li to 1.0, which will result in lse/out = 0.0 after the log(li) + mi(0.0) step + l_i = tl.where(l_i == 0.0, 1, l_i) + + acc = acc / l_i[:, None] + idx_z = tl.program_id(1) // HQ + idx_hq = tl.program_id(1) % HQ + idx_m = offs_m[:, None] + idx_d = tl.arange(0, V_HEAD_DIM)[None, :] + + mask = idx_m < Q_LEN + # TODO generalize and add proper mask support + {{store_output(("idx_z", "idx_hq", "idx_m", "idx_d"), "acc", "mask")}} + + # TODO: don't write this out if we don't require grad + if OUTPUT_LOGSUMEXP: + off_hz = tl.program_id(1) + l_ptrs = LSE + off_hz * Q_LEN + offs_m + lse = m_i + tl.math.log2(l_i) + if IS_DIVISIBLE: + tl.store(l_ptrs, lse) + else: + tl.store(l_ptrs, lse, mask=offs_m < Q_LEN) + """ + + +compute_forward_inner = r""" +@triton.jit +def forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets used as inputs to score_mod & mask_mod + # of size [BLOCK_M, BLOCK_N] or scalar. + off_z, off_h, offs_m, offs_n, + # blocksparse data + kv_indices, kv_num_blocks, + # start kv and end kv block + block_n_start, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS, +): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + {{gen_defines() | indent_except_first(1)}} + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + RCP_LN2: tl.constexpr = 1.44269504 + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + # loop over k, v and update accumulator until block_n_end + for start_n in range(block_n_start, block_n_end): + if IS_DIVISIBLE: + acc, l_i, m_i = forward_block_mn( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + else: + # Benchmarks show that even when we apply mod & mask to every block for a non-divisible seqlen, + # it's on par with or slightly faster than applying them only to the last block in fwd. + # However, we choose a different strategy for bwd, where we only apply mod & mask + # to the last block, because doing so is much faster. + acc, l_i, m_i = forward_block_mn( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + + # update pointers + offset = get_offset_for_next_block( + start_n, kv_indices, kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N + ) + + V_block_ptr = tl.advance(V_block_ptr, (offset, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, offset)) + + offs_n = offs_n + offset + + return acc, l_i, m_i + +""" + + +compute_forward_block_mn = r""" +@triton.jit +def forward_block_mn( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, +): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + {{gen_defines() | indent_except_first(1)}} + + # -- load k -- + if IS_DIVISIBLE: + k = tl.load(K_block_ptr) + else: + k = tl.load(K_block_ptr, boundary_check=(1,), padding_option="zero") + # -- compute qk --- + qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
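+ # Change-of-base note: the kernel works in base-2 space so tl.math.exp2 can be + # used for the softmax; multiplying the scores by RCP_LN2 (= log2(e)) performs + # the change of base, since exp2(x * log2(e)) == exp(x). When PRESCALE_QK is set, + # q was already scaled by SM_SCALE * RCP_LN2 in forward_inner, so both rescales + # below are skipped.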
+ if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + # If this is the last block of a non-divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements, + # which is more than the actual number of valid elements. To avoid out-of-bounds memory accesses, + # we need to mask out the elements that fall outside of Q_LEN & KV_LEN. + m = offs_m % Q_LEN + n = offs_n % KV_LEN + else: + m = offs_m + n = offs_n + + {{ modification( + subgraph_number=0, + output_name="post_mod_scores", + score="qk", + b="off_z", + h="off_h", + m="m", + n="n", + out="qk" + ) | indent_except_first(1) }} + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for a non-divisible seqlen. + post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + {{ modification( + subgraph_number=1, + output_name="mask_mod_output", + score="qk", + b="off_z", + h="off_h", + m="m", + n="n", + ) | indent_except_first(2) }} + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partially unmasked blocks + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + + # TODO: In the case that score_mod is linear, this can be LICMed + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # -- compute scaling constant --- + m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1)) + if not ROWS_GUARANTEED_SAFE: + masked_out_rows = (m_ij == float("-inf")) + m_ij_masked = tl.where(masked_out_rows, 0, m_ij) + else: + m_ij_masked = m_ij + + alpha = tl.math.exp2(m_i - m_ij_masked) + p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None]) + + # NB: l_i update is pulled up here since it's a bit faster + # NB: For headdim=256, it's faster to move it back down to after m_i = + # m_ij + l_i = l_i * alpha + tl.sum(p, 1) + # -- scale and update acc -- + acc = acc * alpha[:, None] + + if IS_DIVISIBLE: + v = tl.load(V_block_ptr) + else: + v = tl.load(V_block_ptr, boundary_check=(0,), padding_option="zero") + acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION) + + # -- update m_i + m_i = m_ij + + return acc, l_i, m_i + +""" + + +flex_attention_template = TritonTemplate( + name="flex_attention", + grid=flex_attention_grid, + source=compute_flex_attention + + compute_forward_inner + + compute_next_offset_func + + compute_forward_block_mn, +) + + +def _use_flex_decoding(query, kernel_options): + # Decide which kernel to use; return True if we should use the flex-decoding kernel.
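+ # Heuristic (illustration): flex decoding targets short queries, e.g.
+ # autoregressive decoding with a handful of query tokens, where splitting
+ # the work across the KV sequence keeps the GPU occupied. The check below
+ # routes there when the query seqlen is provably < 128 and
+ # FORCE_USE_FLEX_ATTENTION isn't set.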
+ return ( + not kernel_options.get("FORCE_USE_FLEX_ATTENTION", False) + ) and V.graph.sizevars.evaluate_expr(sympy.Lt(query.get_size()[-2], 128)) + + +_h100_default_config = { + (torch.float32, 64): (128, 32, 4, 3), + (torch.float32, 128): (32, 64, 4, 3), + (torch.float32, 256): (32, 32, 4, 3), + (torch.bfloat16, 64): (128, 128, 4, 3), + (torch.bfloat16, 128): (128, 64, 8, 3), + (torch.bfloat16, 256): (64, 32, 4, 3), + (torch.float16, 64): (128, 128, 4, 3), + (torch.float16, 128): (128, 128, 8, 3), + (torch.float16, 256): (64, 32, 4, 3), +} + +_a100_default_config = { + (torch.float32, 64): (128, 32, 4, 3), + (torch.float32, 128): (128, 32, 4, 3), + (torch.float32, 256): (64, 16, 4, 3), + (torch.bfloat16, 64): (128, 64, 4, 3), + (torch.bfloat16, 128): (128, 64, 8, 3), + (torch.bfloat16, 256): (32, 64, 4, 3), + (torch.float16, 64): (128, 64, 4, 3), + (torch.float16, 128): (128, 64, 8, 3), + (torch.float16, 256): (32, 64, 4, 3), +} + + +def _get_default_config_fwd(query) -> Tuple[int, int, int, int]: + dtype = query.get_dtype() + head_dim = query.get_size()[-1] + default_config = None + + if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100 + if dtype == torch.float32: + default_config = (64, 64, 4, 3) + else: + default_config = (128, 64, 4, 3) + default_config = _h100_default_config.get((dtype, head_dim), default_config) + elif head_dim <= 256 and torch.cuda.get_device_capability() >= (8, 0): # A100 + if dtype == torch.float32: + default_config = (64, 64, 4, 3) + else: + default_config = (128, 64, 4, 3) + default_config = _a100_default_config.get((dtype, head_dim), default_config) + else: # modest hardware or extremely large head_dim + if dtype == torch.float32: + default_config = (32, 16, 4, 3) + else: + default_config = (64, 32, 4, 3) + + return default_config + + +def _get_default_config_bwd(query) -> Tuple[int, int, int, int]: + head_dim = query.get_size()[-1] + dtype = query.get_dtype() + + if dtype == torch.float32: + return (16, 16, 4, 1) + if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100 + if head_dim == 64: + return (64, 64, 4, 3) + elif head_dim == 128: + return (64, 128, 8, 3) + else: + return (64, 64, 4, 2) + elif torch.cuda.get_device_capability() >= (8, 0): # A100 + if head_dim == 64: + return (32, 128, 4, 3) + elif head_dim == 128: + return (64, 128, 8, 3) + else: + return (64, 64, 4, 2) + else: # modest hardware or extremely large head_dim + return (16, 16, 4, 1) + + +def create_num_blocks_fake_generator(sparse_indices): + # The idea here is that we need to create a real tensor with real data + # that's representative for benchmarking. + # For example, returning all zeros for the `kv_num_blocks` input would mean + # that we are computing 0 blocks for each row, which would provide bogus + # autotuning results. + # + # In this case, we choose to use min(16, max_block) blocks, because I + # (Horace) think it'll probably result in pretty representative performance. + # If it's too short then prefetching won't help. If it's too long then + # autotuning will take longer for no good reason. 
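+ # For instance (hypothetical shapes): with sparse_indices of shape
+ # [1, 1, 64], the fake tensor is filled with min(16, 64) == 16, i.e. every
+ # row pretends to visit 16 KV blocks while benchmarking candidate configs.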
+ def create_num_blocks_fake(x) -> torch.Tensor: + num_blocks_for_autotuning = min(16, sparse_indices.shape[-1]) + return torch.full( + x.get_size(), + int(num_blocks_for_autotuning), + dtype=x.get_dtype(), + device=x.get_device(), + ) + + return create_num_blocks_fake + + +def create_indices_fake(x) -> torch.Tensor: + indices = torch.arange( + 0, int(x.get_size()[-1]), dtype=x.get_dtype(), device=x.get_device() + ) + indices = indices.expand(x.get_size()).contiguous() + return indices + + +from torch._inductor.kernel.flex_decoding import create_flex_decoding_kernel + + +# TODO: We probably also need a layout constraint? +@register_lowering(torch.ops.higher_order.flex_attention, type_promotion_kind=None) +def flex_attention( + query, + key, + value, + subgraph, + block_mask, + scale, + kernel_options, + score_mod_other_buffers, + mask_mod_other_buffers, +): + ( + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + SPARSE_KV_BLOCK_SIZE, + SPARSE_Q_BLOCK_SIZE, + mask_graph, + ) = block_mask + placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("score", query.get_dtype()), + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + subgraph_buffer = build_subgraph_buffer( + placeholder_inps + list(score_mod_other_buffers), subgraph + ) + mask_graph_placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + mask_graph_buffer = build_subgraph_buffer( + mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph + ) + kernel_options = dict(kernel_options) + kernel_options.setdefault("FLOAT32_PRECISION", get_float32_precision()) + if _use_flex_decoding(query, kernel_options): + return create_flex_decoding_kernel( + query, + key, + value, + block_mask, + scale, + kernel_options, + subgraph_buffer, + mask_graph_buffer, + score_mod_other_buffers, + mask_mod_other_buffers, + ) + + ( + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ) = maybe_realize( + [ + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ] + ) + + Bq, Hq, seq_len_q, qk_head_dim = query.get_size() + Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size() + assert Bq == Bkv, "Batch dimension must match" + B = Bq + + if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0: + kernel_options.setdefault("IS_DIVISIBLE", False) + else: + kernel_options.setdefault("IS_DIVISIBLE", True) + + # Reuse query strides for output layout despite different last dimension. + # This works because only the last dim differs and we check it is contiguous. + q_strides = query.get_stride() + assert q_strides[-1] == 1, "Query must be contiguous in the last dimension" + + # Construct output layout with strides matching the query. 
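+ # Illustration (hypothetical sizes): a contiguous query [B, Hq, M, Dqk] has
+ # strides (Hq*M*Dqk, M*Dqk, Dqk, 1); reusing its fill order for the output
+ # size [B, Hq, M, Dv] yields strides (Hq*M*Dv, M*Dv, Dv, 1) -- the same
+ # dimension ordering, recomputed for the new head dim.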
+ out_size = [B, Hq, seq_len_q, v_head_dim] + stride_order = get_stride_order(query.get_stride()) + fill_order = stride_order2fill_order(stride_order) + out_strides = construct_strides(out_size, fill_order) + + layout = FixedLayout( + query.get_device(), + query.get_dtype(), + [B, Hq, seq_len_q, v_head_dim], + stride=out_strides, + ) + # see NOTE:[TritonTemplates with multiple outputs] + logsumexp_shape = [B, Hq, seq_len_q] + logsumexp = empty_strided( + logsumexp_shape, + None, + dtype=torch.float32, # The logsumexp is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + kernel_options.setdefault("SM_SCALE", scale) + + # Determine GQA broadcast factor. + gqa_shared_heads = Hq // Hkv + kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads) + + # Inside of Triton kernel, only apply partial masking if partial blocks are computed. + # full_kv_num_blocks is None if partial blocks are not computed + has_full_blocks = full_kv_num_blocks is not None + kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks) + if not has_full_blocks: + full_kv_num_blocks, full_kv_indices = ( + empty(0, device=query.get_device()) for _ in range(2) + ) + kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim) + kernel_options.setdefault("V_HEAD_DIM", v_head_dim) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int, int]] = [] + configs.append(_get_default_config_fwd(query)) + if config.max_autotune: + configs += [ + (128, 64, 4, 3), + (128, 128, 4, 3), + (128, 128, 8, 2), + (64, 128, 4, 3), + (64, 64, 4, 3), + ] + + # Note, we don't need to pass in the captured buffers explicitly + # because they're implicitly added by the score_mod function + # We do need to explicitly pass it in for autotuning though. + + for BLOCK_M, BLOCK_N, num_warps, num_stages in configs: + if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0 or SPARSE_Q_BLOCK_SIZE % BLOCK_M != 0: + continue + # Work around https://github.com/pytorch/pytorch/issues/129625 + if num_stages == 2: + continue + + # Performance tuning + kernel_options.setdefault("BLOCK_M", BLOCK_M) + kernel_options.setdefault("BLOCK_N", BLOCK_N) + # Blocksparse options + kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) + kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + + flex_attention_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + logsumexp, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ], + layout=layout, + subgraphs=[ + subgraph_buffer, + mask_graph_buffer, + ], + mutated_inputs=[ + logsumexp, + ], + num_stages=num_stages, + num_warps=num_warps, + call_sizes=query.get_size(), + **kernel_options, + ) + inputs_for_autotuning = ( + [ + query, + key, + value, + logsumexp, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ] + + list(score_mod_other_buffers) + + list(mask_mod_other_buffers) + ) + input_gen_fns = { + 4: create_num_blocks_fake_generator(kv_indices), + 5: create_indices_fake, + 6: create_num_blocks_fake_generator(full_kv_indices), + 7: create_indices_fake, + } + return ( + autotune_select_algorithm( + "flex_attention", + choices, + inputs_for_autotuning, + layout, + input_gen_fns=input_gen_fns, + ), + logsumexp, + ) + + +# ---------------------------- Backward HOP Implementation ---------------------------- + + +def flex_attention_backward_grid( + batch_size, q_heads, num_queries, d_model, kv_heads, num_key_value, meta +): + """How is this kernel parallelized? 
+ Currently this only parallelizes over batch * kv_heads, but we can, and want to, + parallelize over ceil_div(q_heads//kv_heads * num_key_value, key_value_block_size). + Doing so will require either atomic updates to some grad values or a two-pass kernel design. + """ + import triton + + return ( + triton.cdiv(num_queries, meta["BLOCK_M2"]) * (q_heads // kv_heads) + + triton.cdiv(num_key_value, meta["BLOCK_N1"]), + 1, + batch_size * kv_heads, + ) + + +flex_attention_backward_template = TritonTemplate( + name="flex_attention_backward", + grid=flex_attention_backward_grid, + source=r""" +{{def_kernel("Q", "K", "V", "LSE", "DELTA", "DO", "DQ", "DV", "KV_NUM_BLKS", "KV_IDX", "Q_NUM_BLKS", "Q_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX", "FULL_Q_NUM_BLKS", "FULL_Q_IDX")}} + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype) + # DELTA: Precomputed sum(OUT*DO, axis=-1) + # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value + # DK: Derivative of Key, which is written to via the store_output call due to some limitations with + # inductor codegen + # M: Number of queries, N: Number of keys/values + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # (Modifiable) Performance tuning options + # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block. + # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V. + # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q. + # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block. + # + # The following FULL_* and PARTIAL_* are defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query. + # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query. + # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query. + + # The kernel options below can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. This has + # about 20% more numerical error, but is slightly faster.
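+ # NB (illustration): program_id(0) is split into two ranges. The first
+ # NUM_KV_BLOCKS programs each own one BLOCK_N1 tile of K/V and compute
+ # dK/dV (looping over Q tiles); the remaining programs each own one
+ # BLOCK_M2 tile of Q per query head in the GQA group and compute dQ
+ # (looping over KV tiles). program_id(2) enumerates batch * HKV.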
+ + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qd = {{stride("Q")}} + stride_kz, stride_kh, stride_kn, stride_kd = {{stride("K")}} + stride_vz, stride_vh, stride_vn, stride_vd = {{stride("V")}} + stride_doz, stride_doh, stride_dom, stride_dod = {{stride("DO")}} + + stride_dqz, stride_dqh, stride_dqm, stride_dqd = {{stride("DQ")}} + stride_dvz, stride_dvh, stride_dvm, stride_dvd = {{stride("DV")}} + + Z = {{size("Q", 0)}} + HQ = {{size("Q", 1)}} + HKV = {{size("K", 1)}} + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + MATMUL_PRECISION = Q.dtype.element_ty + + pid = tl.program_id(0) + NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1) + NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2) + + off_hz = tl.program_id(2) + off_z = off_hz // HKV # batch idx + off_hkv = off_hz % HKV # kv head idx + + SPARSE_Z = {{size("KV_NUM_BLKS", 0)}} + SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}} + + sparse_idx_z = off_z % SPARSE_Z + + k_adj = (stride_kh * off_hkv + stride_kz * off_z).to(tl.int64) + v_adj = (stride_vh * off_hkv + stride_vz * off_z).to(tl.int64) + dv_adj = (stride_dvh * off_hkv + stride_dvz * off_z).to(tl.int64) + + # offset K, V, DV pointers for batch/kv-head + K += k_adj + V += v_adj + DV += dv_adj + + RCP_LN2 = 1.44269504 + offs_k = tl.arange(0, QK_HEAD_DIM) + offs_v = tl.arange(0, V_HEAD_DIM) + + if pid >= NUM_KV_BLOCKS: + off_pid = pid - NUM_KV_BLOCKS + # THIS BLOCK DOES DQ + SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2) + SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2) + off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS + start_m2_block = off_pid % NUM_Q_BLOCKS + off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE + stride_kv_num_blks_h = {{stride("KV_NUM_BLKS", 1)}} + stride_kv_idx_h = {{stride("KV_IDX", 1)}} + stride_kv_idx_m = {{stride("KV_IDX", 2)}} + + sparse_idx_hq2 = off_hq2 % SPARSE_HQ + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2 + + sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask + sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950 + + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads. + q_adj2 = (stride_qh * off_hq2 + stride_qz * off_z).to(tl.int64) + do_adj2 = (stride_doh * off_hq2 + stride_doz * off_z).to(tl.int64) + dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_z).to(tl.int64) + off_chz2 = ((off_z * HQ + off_hq2) * Q_LEN).to(tl.int64) + + Q2 = Q + q_adj2 + DO2 = DO + do_adj2 + # TODO: This does not work if DQ is not the same layout as Q (for example, + # if Q is broadcasted) + DQ2 = DQ + dq_adj2 + LSE2 = LSE + off_chz2 + DELTA2 = DELTA + off_chz2 + + dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32) + + start_m2 = start_m2_block * BLOCK_M2 + offs_m2 = start_m2 + tl.arange(0, BLOCK_M2) + + # load Q and do: they stay in SRAM throughout the inner loop. 
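+ # NB (illustration): DELTA holds Di ~= rowsum(OUT * DO) (adjusted for any
+ # incoming grad of the logsumexp), precomputed by the lowering. It enters
+ # the softmax backward identity
+ #   dS = P * (dP - Di), where dP = dO @ V^T,
+ # which is what bwd_dq_block_mn and bwd_dkdv_block_mn compute below.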
+ if IS_DIVISIBLE: + q = tl.load(Q2 + offs_m2[:, None] * stride_qm + offs_k[None, :] * stride_qd) + do = tl.load(DO2 + offs_m2[:, None] * stride_dom + offs_v[None, :] * stride_dod) + else: + q = tl.load(Q2 + offs_m2[:, None] * stride_qm + offs_k[None, :] * stride_qd, mask=offs_m2[:, None] < Q_LEN) + do = tl.load(DO2 + offs_m2[:, None] * stride_dom + offs_v[None, :] * stride_dod, mask=offs_m2[:, None] < Q_LEN) + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + if IS_DIVISIBLE: + Di = tl.load(DELTA2 + offs_m2) + lse = tl.load(LSE2 + offs_m2) + else: + Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN) + lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN) + lse = tl.where(lse == -float("inf"), 0.0, lse) + lse = lse[:, None] + + # ~~~~~~~~~~~ partially unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~ + # KV_IDX and KV_NUM_BLKS are always contiguous. + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + + offs_n2 = kv_start + tl.arange(0, BLOCK_N2) + dq = bwd_dq_inner( + {{gen_argdefs()}}, + K, V, + dq, q, do, Di, lse, + off_z, off_hq2, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + if HAS_FULL_BLOCKS: + # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + + offs_n2 = kv_start + tl.arange(0, BLOCK_N2) + dq = bwd_dq_inner( + {{gen_argdefs()}}, + K, V, + dq, q, do, Di, lse, + off_z, off_hq2, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + # Write back dQ. + dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd + dq *= SM_SCALE + if IS_DIVISIBLE: + tl.store(dq_ptrs, dq) + else: + tl.store(dq_ptrs, dq, mask=offs_m2[:, None] < Q_LEN) + else: + # THIS BLOCK DOES DK & DV + SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1) + SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1) + + pid_mask = pid // SPARSE_KV_MULTIPLE + + stride_q_num_blks_h = {{stride("Q_NUM_BLKS", 1)}} + stride_q_idx_h = {{stride("Q_IDX", 1)}} + stride_q_idx_n = {{stride("Q_IDX", 2)}} + + dv = tl.zeros([BLOCK_N1, V_HEAD_DIM], dtype=tl.float32) + dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM], dtype=tl.float32) + + start_n1 = pid * BLOCK_N1 + offs_n1 = start_n1 + tl.arange(0, BLOCK_N1) + + # load K and V: they stay in SRAM throughout the inner loop. + if IS_DIVISIBLE: + k = tl.load(K + offs_n1[:, None] * stride_kn + offs_k[None, :] * stride_kd) + v = tl.load(V + offs_n1[:, None] * stride_vn + offs_v[None, :] * stride_vd) + else: + k = tl.load(K + offs_n1[:, None] * stride_kn + offs_k[None, :] * stride_kd, mask=offs_n1[:, None] < KV_LEN) + v = tl.load(V + offs_n1[:, None] * stride_vn + offs_v[None, :] * stride_vd, mask=offs_n1[:, None] < KV_LEN) + if PRESCALE_QK: + k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + for off_g in range(0, GQA_SHARED_HEADS): + off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g + + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offset by query head.
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_z).to(tl.int64) + do_adj1 = (stride_doh * off_hq1 + stride_doz * off_z).to(tl.int64) + dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_z).to(tl.int64) + off_chz1 = ((off_z * HQ + off_hq1) * Q_LEN).to(tl.int64) + + Q1 = Q + q_adj1 + DO1 = DO + do_adj1 + # TODO: This does not work if DQ is not the same layout as Q (for example, + # if Q is broadcasted) + LSE1 = LSE + off_chz1 + DELTA1 = DELTA + off_chz1 + + sparse_idx_hq1 = off_hq1 % SPARSE_HQ + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1 + + sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask + sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950 + + # ~~~~~~~~~~~~~~~ partially unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Q_IDX and Q_NUM_BLKS are always contiguous. + q_indices = Q_IDX + sparse_q_idx_offset + q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading + sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset) + + offs_m1 = q_start + tl.arange(0, BLOCK_M1) + dk, dv = bwd_dkdv_inner( + {{gen_argdefs()}}, + Q1, DO1, DELTA1, LSE1, + dk, dv, k, v, + off_z, off_hq1, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + + if HAS_FULL_BLOCKS: + # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous. + q_indices = FULL_Q_IDX + sparse_q_idx_offset + q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading + sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset) + + offs_m1 = q_start + tl.arange(0, BLOCK_M1) + dk, dv = bwd_dkdv_inner( + {{gen_argdefs()}}, + Q1, DO1, DELTA1, LSE1, + dk, dv, k, v, + off_z, off_hq1, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + # Write back dV and dK. + dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd + + index_n = offs_n1[:, None] + index_k = offs_k[None, :] + + if IS_DIVISIBLE: + tl.store(dv_ptrs, dv) + else: + tl.store(dv_ptrs, dv, mask=index_n < KV_LEN) + + dk *= SM_SCALE + mask = index_n < KV_LEN + {{store_output(("off_z", "off_hkv", "index_n", "index_k"), "dk", "mask", indent_width=8)}} + +@triton.jit +def bwd_dq_inner( + {{gen_argdefs()}}, + K, V, # pointers + dq, q, do, Di, lse, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS, +): + {{gen_defines() | indent_except_first(1) }} + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2) + RCP_LN2: tl.constexpr = 1.44269504 + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + offs_k = tl.arange(0, QK_HEAD_DIM) + offs_v = tl.arange(0, V_HEAD_DIM) + + kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd + vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd + # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0) + + hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1)) + if not IS_DIVISIBLE: + if hi >= 1: + for start_n in range(0, hi - 1): + dq = bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + + # Increment pointers. + offset = get_offset_for_next_block( + start_n, kv_indices, sparse_kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2 + ) + + kT_ptrs += offset * stride_kn + vT_ptrs += offset * stride_vn + + offs_n2 += offset + + dq = bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + else: + for start_n in range(0, hi): + dq = bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + + # Increment pointers. + offset = get_offset_for_next_block( + start_n, kv_indices, sparse_kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2 + ) + + kT_ptrs += offset * stride_kn + vT_ptrs += offset * stride_vn + + offs_n2 += offset + + return dq + + +@triton.jit +def bwd_dq_block_mn( + {{gen_argdefs()}}, + dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN, + off_z, off_hq, offs_m2, offs_n2, + stride_kn, stride_kd, stride_vn, stride_vd, + kv_indices, sparse_kv_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, +): + {{gen_defines() | indent_except_first(1)}} + + if IS_DIVISIBLE: + kT = tl.load(kT_ptrs) + else: + kT = tl.load(kT_ptrs, mask=offs_n2[None, :] < KV_LEN) + qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION) + if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + pre_mod_scores = qk + if CHECK_BLOCK_BOUNDARY: + m = offs_m2[:, None] % Q_LEN + n = offs_n2[None, :] % KV_LEN + else: + m = offs_m2[:, None] + n = offs_n2[None, :] + {{ modification( + subgraph_number=0, + output_name="post_mod_scores", + score="qk", + b="off_z", + h="off_hq", + m="m", + n="n", + out="qk" + ) | indent_except_first(1) }} + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + {{ modification( + subgraph_number=2, + output_name="mask_mod_output", + score="qk", + b="off_z", + h="off_hq", + m="m", + n="n", + ) | indent_except_first(2) }} + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n2[None, :] < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partial masked block + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + p = tl.math.exp2(post_mod_scores - lse) + # Compute dP and dS. 
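+ # NB (illustration): p recomputes the softmax probabilities from the saved
+ # logsumexp instead of storing them in the forward pass -- the usual
+ # FlashAttention memory trade. Since lse = m_i + log2(l_i) is in base-2
+ # units and the scores were scaled by RCP_LN2, exp2(score - lse) equals the
+ # forward softmax row-wise.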
+ if IS_DIVISIBLE: + vT = tl.load(vT_ptrs) + else: + vT = tl.load(vT_ptrs, mask=offs_n2[None, :] < KV_LEN) + dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION) + ds = p * (dp - Di[:, None]) + # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~ + {{ modification( + subgraph_number=1, + output_name = "grad_scores", + score="pre_mod_scores", + b="off_z", + h="off_hq", + m="m", + n="n", + grad_score_mod="ds" + ) | indent_except_first(1) }} + if CHECK_BLOCK_BOUNDARY: + grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0) + + ds = grad_scores + + if not IS_FULL_BLOCKS: + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n2[None, :] < KV_LEN, mask_mod_output, float("-inf")) + # (grads) apply mask for partially unmasked block + ds = tl.where(mask_mod_output, ds, 0.0) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ds = ds.to(MATMUL_PRECISION) + # Compute dQ. + dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION) + + return dq + + +@triton.jit +def bwd_dkdv_inner( + {{gen_argdefs()}}, + Q, DO, DELTA, LSE, # pointers + dk, dv, k, v, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, + IS_FULL_BLOCKS, +): + {{gen_defines() | indent_except_first(1) }} + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1) + RCP_LN2: tl.constexpr = 1.44269504 + Q_LEN = {{size("Q", 2)}} + KV_LEN = {{size("K", 2)}} + + offs_k = tl.arange(0, QK_HEAD_DIM) + offs_v = tl.arange(0, V_HEAD_DIM) + + qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd + do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod + # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work. + tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0) + hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1)) + + if not IS_DIVISIBLE: + if hi >= 1: + for start_m in range(0, hi - 1): + dk, dv = bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + # Increment pointers. + offset = get_offset_for_next_block( + start_m, q_indices, sparse_q_num_blocks, + SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1 + ) + + qT_ptrs += offset * stride_qm + do_ptrs += offset * stride_dom + + offs_m1 += offset + + dk, dv = bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + else: + for start_m in range(0, hi): + dk, dv = bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + # Increment pointers. 
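+ # NB (assumption, inferred from how the result is used): get_offset_for_next_block
+ # returns the element distance along the seqlen dim from the current block to the
+ # next block named by the block-sparse indices, so consecutive tiles inside one
+ # sparse block advance by BLOCK size while crossing sparse blocks can skip over
+ # masked-out spans entirely.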
+ offset = get_offset_for_next_block( + start_m, q_indices, sparse_q_num_blocks, + SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1 + ) + + qT_ptrs += offset * stride_qm + do_ptrs += offset * stride_dom + + offs_m1 += offset + + return dk, dv + + +@triton.jit +def bwd_dkdv_block_mn( + {{gen_argdefs()}}, + dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN, + off_z, off_hq, offs_n1, offs_m1, + stride_qm, stride_qd, stride_dom, stride_dod, + q_indices, sparse_q_num_blocks, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, +): + {{gen_defines() | indent_except_first(1) }} + + # Load LSE before computing qk to reduce pipeline stall. + if IS_DIVISIBLE: + qT = tl.load(qT_ptrs) + lse = tl.load(LSE + offs_m1) + else: + qT = tl.load(qT_ptrs, mask=offs_m1[None, :] < Q_LEN) + lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN) + lse = tl.where(lse == -float("inf"), 0.0, lse) + qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION) + if not PRESCALE_QK: + qkT *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + m = offs_m1[None, :] % Q_LEN + n = offs_n1[:, None] % KV_LEN + else: + m = offs_m1[None, :] + n = offs_n1[:, None] + pre_mod_scores = qkT + {{ modification( + subgraph_number=0, + output_name="post_mod_scores", + score="qkT", + b="off_z", + h="off_hq", + m="m", + n="n", + out="qkT" + ) | indent_except_first(1) }} + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n1[:, None] < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + {{ modification( + subgraph_number=2, + output_name="mask_mod_output", + score="qkT", + b="off_z", + h="off_hq", + m="m", + n="n", + ) | indent_except_first(2) }} + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n1[:, None] < KV_LEN, mask_mod_output, float("-inf")) + # (grads) apply mask for fully masked block + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + pT = tl.math.exp2(post_mod_scores - lse[None, :]) + if IS_DIVISIBLE: + do = tl.load(do_ptrs) + else: + do = tl.load(do_ptrs, mask=offs_m1[:, None] < Q_LEN) + # Compute dV. + ppT = pT + dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION) + if IS_DIVISIBLE: + Di = tl.load(DELTA + offs_m1) + else: + Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN) + # Compute dP and dS. 
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION) + dsT = pT * (dpT - Di[None, :]) + # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~ + {{ modification( + subgraph_number=1, + output_name = "grad_scores", + score="pre_mod_scores", + b="off_z", + h="off_hq", + m="m", + n="n", + grad_score_mod="dsT" + ) | indent_except_first(1) }} + if CHECK_BLOCK_BOUNDARY: + grad_scores = tl.where(offs_n1[:, None] < KV_LEN, grad_scores, 0.0) + + dsT = grad_scores + if not IS_FULL_BLOCKS: + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n1[:, None] < KV_LEN, mask_mod_output, float("-inf")) + # (grads) apply mask for partially unmasked block + dsT = tl.where(mask_mod_output, dsT, 0.0) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION) + + return dk, dv + """ + + compute_next_offset_func, +) + + +# TODO: We probably also need a layout constraint? +@register_lowering( + torch.ops.higher_order.flex_attention_backward, type_promotion_kind=None +) +def flex_attention_backward(*args, **kwargs): + ( + query, + key, + value, + out, + logsumexp, + grad_out, + grad_logsumexp, + fw_graph, + joint_graph, + block_mask, + scale, + kernel_options, + score_mod_other_buffers, + mask_mod_other_buffers, + ) = args + ( + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + SPARSE_KV_BLOCK_SIZE, + SPARSE_Q_BLOCK_SIZE, + mask_graph, + ) = block_mask + + ( + query, + key, + value, + grad_out, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ) = maybe_realize( + [ + query, + key, + value, + grad_out, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + q_num_blocks, + q_indices, + full_q_num_blocks, + full_q_indices, + ] + ) + + device = query.get_device() + dtype = query.get_dtype() + Bq, Hq, seq_len_q, qk_head_dim = query.get_size() + Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size() + assert Bq == Bkv, "Batch dimension must match" + B = Bq + + kernel_options = dict(kernel_options) + kernel_options.setdefault("FLOAT32_PRECISION", get_float32_precision()) + if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0: + kernel_options.setdefault("IS_DIVISIBLE", False) + else: + kernel_options.setdefault("IS_DIVISIBLE", True) + + fwd_placeholder_inps = [ + create_placeholder(name, dtype, device) + for name, dtype in [ + ("score", dtype), + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + fw_subgraph_buffer = build_subgraph_buffer( + fwd_placeholder_inps + list(score_mod_other_buffers), fw_graph + ) + + joint_placeholder_inps = fwd_placeholder_inps + [ + create_placeholder("grad_score_mod", dtype, device) + ] + joint_subgraph_buffer, *_ = build_subgraph_buffer( + joint_placeholder_inps + list(score_mod_other_buffers), joint_graph + ) + + mask_graph_placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + mask_graph_buffer = build_subgraph_buffer( + mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph + ) + + layout_k = FixedLayout( + key.get_device(), + key.get_dtype(), + key.get_size(), + key.get_stride(), + ) + + # Create delta which will is needed for the bwd's kernel + grad_lse_exp2 = 
lowerings[aten.mul](grad_logsumexp, 1 / math.log(2)) + mul_delta = lowerings[aten.mul](out, grad_out) + delta = lowerings[aten.sum](mul_delta, axis=-1) + delta = lowerings[aten.sub](delta, grad_lse_exp2) + delta = ExternKernel.require_contiguous(delta) + + grad_lse_exp2, delta = maybe_realize([grad_lse_exp2, delta]) + + # see NOTE:[TritonTemplates with multiple outputs] + grad_query = empty_strided( + query.get_size(), query.get_stride(), dtype=dtype, device=device + ) + grad_value = empty_strided( + value.get_size(), value.get_stride(), dtype=dtype, device=device + ) + + kernel_options.setdefault("SM_SCALE", scale) + + # Determine GQA factor + gqa_shared_heads = Hq // Hkv + kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads) + + # Inside of Triton kernel, only apply partial masking if partial blocks are computed. + # full_kv_num_blocks is torch.zeros([1, 1, 1]) if partial blocks are not computed. + has_full_blocks = full_kv_num_blocks is not None + kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks) + if not has_full_blocks: + full_kv_num_blocks, full_kv_indices, full_q_num_blocks, full_q_indices = ( + empty(0, device=query.get_device()) for _ in range(4) + ) + kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim) + kernel_options.setdefault("V_HEAD_DIM", v_head_dim) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int, int]] = [] + configs.append(_get_default_config_bwd(query)) + if config.max_autotune: + configs.extend( + [ + (BLOCK1, BLOCK2, w, s) + for BLOCK1 in [32, 64] + for BLOCK2 in [32, 64, 128] + for w in [4, 8] + for s in [1, 3, 4, 5] + if BLOCK2 % BLOCK1 == 0 + ] + ) + + for BLOCK1, BLOCK2, num_warps, num_stages in configs: + if ( + SPARSE_KV_BLOCK_SIZE % BLOCK1 != 0 + or SPARSE_Q_BLOCK_SIZE % BLOCK1 != 0 + or SPARSE_KV_BLOCK_SIZE % BLOCK2 != 0 + or SPARSE_Q_BLOCK_SIZE % BLOCK2 != 0 + ): + continue + + # Performance tuning + kernel_options.setdefault("BLOCK_M1", BLOCK1) + kernel_options.setdefault("BLOCK_N1", BLOCK2) + kernel_options.setdefault("BLOCK_M2", BLOCK2) + kernel_options.setdefault("BLOCK_N2", BLOCK1) + # Blocksparse options + kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) + kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + + flex_attention_backward_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + logsumexp, + delta, + grad_out, + grad_query, + grad_value, + kv_num_blocks, + kv_indices, + q_num_blocks, + q_indices, + full_kv_num_blocks, + full_kv_indices, + full_q_num_blocks, + full_q_indices, + ], + layout=layout_k, # We use store_output only for grad_key + subgraphs=[fw_subgraph_buffer, joint_subgraph_buffer, mask_graph_buffer], + mutated_inputs=[grad_query, grad_value], + call_sizes=query.get_size() + key.get_size()[1:3], + num_stages=num_stages, + num_warps=num_warps, + **kernel_options, + ) + inputs_for_autotuning = ( + [ + query, + key, + value, + logsumexp, + delta, + grad_out, + grad_query, + grad_value, + kv_num_blocks, + kv_indices, + q_num_blocks, + q_indices, + full_kv_num_blocks, + full_kv_indices, + full_q_num_blocks, + full_q_indices, + ] + + list(score_mod_other_buffers) + + list(mask_mod_other_buffers) + ) + input_gen_fns = { + 8: create_num_blocks_fake_generator(kv_indices), # kv_num_blocks + 9: create_indices_fake, + 10: create_num_blocks_fake_generator(q_indices), # q_num_blocks + 11: create_indices_fake, + 12: create_num_blocks_fake_generator(full_kv_indices), # full_kv_num_blocks + 13: create_indices_fake, + 14: 
create_num_blocks_fake_generator(full_q_indices), # full_q_num_blocks + 15: create_indices_fake, + } + + grad_key = autotune_select_algorithm( + "flex_attention_backward", + choices, + inputs_for_autotuning, + layout_k, + input_gen_fns=input_gen_fns, + ) + return ( + grad_query, + grad_key, + grad_value, + ) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py new file mode 100644 index 0000000000000000000000000000000000000000..8a5b7c60eafddacfadf4ee25e7e3c943fb48697f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py @@ -0,0 +1,570 @@ +# mypy: allow-untyped-defs +"""Triton implementation of the flex_attention kernel for short query length (FlexDecoding).""" +from typing import Any, List, Tuple + +import sympy + +import torch +from torch._inductor.virtualized import V + +from .. import config, ir +from ..ir import FixedLayout, FlexibleLayout +from ..lowering import empty, empty_strided, lowerings +from ..runtime.runtime_utils import is_power_of_2, next_power_of_2 +from ..select_algorithm import autotune_select_algorithm, TritonTemplate +from .flex_attention import ( + compute_forward_block_mn, + compute_forward_inner, + compute_next_offset_func, + create_indices_fake, + create_num_blocks_fake_generator, + maybe_realize, +) + + +aten = torch.ops.aten +prims = torch.ops.prims + + +def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, meta): + """How is this kernel parallelized? + We create a grid of (batch_size * kv_heads, SPLIT_KV, 1) + Each block is responsible for iterating over blocks of keys and values, calculating + the local output for its tile of keys and values over the full length of the query. + Groups of SPLIT_KV blocks then combine their outputs to produce the final result. + """ + + return (batch_size * kv_heads, meta["SPLIT_KV"], 1) + + +flex_decoding_template = TritonTemplate( + name="flex_decoding", + grid=flex_decoding_grid, + source=r""" + {{def_kernel("Q", "K", "V", "M", "L", "KV_NUM_BLKS", "KV_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX")}} + # Sub notation for this kernel: + # Q: Query, K: Key, V: Value + # reduction buffers: M rowmax across local KV split, L local sumexp across local KV split + # M: Number of queries, N: Number of keys/values + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # BLOCK_M, QK_HEAD_DIM: the M and D dimensions are always assigned to the same block + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head, t: Number of kv splits + # (Modifiable) Config options: + # SPLIT_KV: number of blocks K & V are split into + # TILE_KV: length of each local KV split + # BLOCK_M: block size that Q is padded along seqlen dim. + # BLOCK_N: block size of K & V along N dimension. + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + # SAFE_M_BOUNDARY: Is Q seqlen a multiple of BLOCK_M? If so, we can skip an extra boundary check for loading query. + # SAFE_N_BOUNDARY: Is KV seqlen a multiple of BLOCK_N? If so, we can skip an extra boundary check for loading key/value. + + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and do the change of base out of the loop.
+ # + # SPARSE_KV_BLOCK_SIZE: sparse mask block size along KV seqlen dim. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # + # + # Output: ACC output accumulated across local KV split. + + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define Q Strides + stride_qz, stride_qh, stride_qg, stride_qm, stride_qk = {{stride("Q")}} + stride_kz, stride_kh, stride_kn, stride_kk = {{stride("K")}} + stride_vz, stride_vh, stride_vn, stride_vk = {{stride("V")}} + stride_mz, stride_mt, stride_mh, stride_mm = {{stride("M")}} + stride_lz, stride_lt, stride_lh, stride_lm = {{stride("L")}} + + + Z = {{size("Q", 0)}} + HKV = {{size("Q", 1)}} + G: tl.constexpr = GQA_SHARED_HEADS + HQ = HKV * G + Q_LEN = {{size("Q", 3)}} + KV_LEN = {{size("K", 2)}} + + MATMUL_PRECISION = Q.dtype.element_ty + + # Make sure each split is a multiple of BLOCK_N + TILE_KV_OG = tl.cdiv(KV_LEN, SPLIT_KV) + TILE_KV = tl.cdiv(TILE_KV_OG, BLOCK_N) * BLOCK_N + TILE_KV_MULTIPLE: tl.constexpr = (TILE_KV // BLOCK_N) + + off_z = tl.program_id(0) // HKV + off_hkv = tl.program_id(0) % HKV + off_t = tl.program_id(1) + + q_offset = off_z * stride_qz + off_hkv * stride_qh + k_offset = off_z * stride_kz + off_hkv * stride_kh + v_offset = off_z * stride_vz + off_hkv * stride_vh + + SPARSE_Z = {{size("KV_NUM_BLKS", 0)}} + SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}} + + sparse_idx_z = off_z % SPARSE_Z + # TODO: support masks not broadcasted along the head dimension. + tl.device_assert(SPARSE_HQ == 1) + sparse_idx_h = 0 + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + SPARSE_KV_BLOCK_CNT = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE) + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + # initialize offsets + tl.device_assert(BLOCK_M % G == 0) + BLOCK_M_PER_HQ: tl.constexpr = BLOCK_M // G + off_g = tl.arange(0, G) # [G] + offs_g = tl.ravel(tl.broadcast_to(off_g[:, None], [G, BLOCK_M_PER_HQ])) # [BLOCK_M] + offs_hq = offs_g + off_hkv * G + off_m = tl.arange(0, BLOCK_M_PER_HQ) # [BLOCK_M_PER_HQ] + offs_m = tl.ravel(tl.broadcast_to(off_m[None, :], [G, BLOCK_M_PER_HQ])) # [BLOCK_M] + offs_d = tl.arange(0, QK_HEAD_DIM) + offs_vd = tl.arange(0, V_HEAD_DIM) + + # KV_IDX / FULL_KV_IDX and KV_NUM_BLKS / FULL_KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_h + + # Calculate the KV blocks that belong to this CTA.
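+ # For example (hypothetical numbers): with KV_LEN = 4096, SPLIT_KV = 8 and
+ # BLOCK_N = 64, TILE_KV = 512 and TILE_KV_MULTIPLE = 8, so the CTA with
+ # off_t = t owns BLOCK_N-sized blocks [8*t, 8*(t+1)) of its KV head.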
+ block_n_start = off_t * TILE_KV_MULTIPLE # n_offset inside sparse block + block_n_end = block_n_start + TILE_KV_MULTIPLE # end BLOCK_N + + q_range = stride_qg * off_g[:, None, None] + stride_qm * off_m[None, :, None] + stride_qk * offs_d[None, None, :] + + if SAFE_M_BOUNDARY: + q = tl.load(Q + q_offset + q_range) + else: + mask = off_m[None, :, None] < Q_LEN + q = tl.load(Q + q_offset + q_range, mask) + + q = tl.reshape(q, [BLOCK_M, QK_HEAD_DIM]) + + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Apply both score_mod and mask_mod + + # find first kv block we are loading and the number of blocks we are loading + kv_indices = KV_IDX + sparse_hz_offset * SPARSE_KV_BLOCK_CNT + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_hz_offset) + indices_idx = block_n_start // SPARSE_KV_MULTIPLE + off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE + off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N + # first kv block we're loading + + # last valid block according to sparse mask + block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(QK_HEAD_DIM, KV_LEN), # (d, N) + strides=(stride_kk, stride_kn), + offsets=(0, off_n), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(off_n, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = tl.arange(0, BLOCK_N) + off_n + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # offsets + off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :], + # block sparse data + kv_indices, kv_num_blocks, + block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + kv_indices = FULL_KV_IDX + sparse_hz_offset * SPARSE_KV_BLOCK_CNT + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_hz_offset) + indices_idx = block_n_start // SPARSE_KV_MULTIPLE + off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE + off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N + + # last valid block according to sparse mask + block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(QK_HEAD_DIM, KV_LEN), # (d, N) + strides=(stride_kk, stride_kn), + offsets=(0, off_n), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(off_n, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = tl.arange(0, BLOCK_N) + off_n + + acc, l_i, m_i = forward_inner( + {{gen_argdefs()}}, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # offsets + off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :], + # block sparse data + kv_indices, kv_num_blocks, + block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid, +
MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + m_offset = off_t * stride_mt + off_z * stride_mz + l_offset = off_t * stride_lt + off_z * stride_lz + + M_block_ptr = tl.make_block_ptr( + base=M + m_offset, + shape=(G, Q_LEN), # (G, M) + strides=(stride_mh, stride_mm), + offsets=(off_hkv*G, 0), + block_shape=(G, BLOCK_M_PER_HQ), + order=(1, 0) + ) + L_block_ptr = tl.make_block_ptr( + base=L + l_offset, + shape=(G, Q_LEN), # (G, M) + strides=(stride_lh, stride_lm), + offsets=(off_hkv*G, 0), + block_shape=(G, BLOCK_M_PER_HQ), + order=(1, 0) + ) + + # Store output, logsumexp and rowmax for cross CTA reduction. (all in float32, even when input data are in fp16) + m_i = m_i.reshape(G, BLOCK_M_PER_HQ) + l_i = l_i.reshape(G, BLOCK_M_PER_HQ) + if SAFE_M_BOUNDARY: + tl.store(M_block_ptr, m_i) + tl.store(L_block_ptr, l_i) + else: + tl.store(M_block_ptr, m_i, boundary_check=(1,)) + tl.store(L_block_ptr, l_i, boundary_check=(1,)) + + # -- store output + idx_z = off_z + idx_t = off_t + idx_hq = off_hkv*G + off_g[:, None, None] + idx_m = off_m[None, :, None] + idx_d = offs_vd[None, None, :] + + mask = (idx_m < Q_LEN) + acc = acc.reshape(G, BLOCK_M_PER_HQ, V_HEAD_DIM) + {{store_output(("idx_z", "idx_t", "idx_hq", "idx_m", "idx_d"), "acc", "mask")}} + """ + + compute_forward_inner + + compute_next_offset_func + + compute_forward_block_mn, +) + + +def get_split_k(B: int, H: int, Mk: int, SM: int = 128) -> int: + """Heuristic for the number of splits from xformer""" + bh = max(B * H, 1) # NOTE: Handle B*h=0 case + split_k = SM // bh # Each SM should at least get one block. + split_k = max(split_k, 1) + + return split_k + + +def _get_decoding_default_config(key) -> Tuple[int, int, int]: + dtype = key.get_dtype() + head_dim = key.get_size()[-1] + sm_version = torch.cuda.get_device_capability() + default_config = (64, 2, 1) + if sm_version >= (9, 0): + if head_dim > 128 and dtype == torch.float32: + return default_config + return (64, 2, 3) + return default_config + + +def create_flex_decoding_kernel(*args, **kwargs): + ( + query, + key, + value, + block_mask, + scale, + kernel_options, + score_mod_subgraph, + mask_mod_subgraph, + score_mod_other_buffers, + mask_mod_other_buffers, + ) = args + ( + kv_num_blocks, + kv_indices, + full_kv_num_blocks, # full_kv_num_blocks, + full_kv_indices, # full_kv_indices, + _, # q_num_blocks + _, # q_indices + _, # full_q_num_blocks, + _, # full_q_indices, + SPARSE_KV_BLOCK_SIZE, + _, # SPARSE_Q_BLOCK_SIZE, + _, + ) = block_mask + + Bq, Hq, seq_len_q, qk_head_dim = query.get_size() + Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size() + assert Bq == Bkv, "Batch dimension must match" + B = Bq + kernel_options = dict(kernel_options) + + # TODO: Fix flex decoding non-divisible case! + if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0: + kernel_options.setdefault("IS_DIVISIBLE", False) + else: + kernel_options.setdefault("IS_DIVISIBLE", True) + + # Calculate GQA head sharing + gqa_shared_heads = Hq // Hkv + if not is_power_of_2(gqa_shared_heads): + raise ValueError( + "Number of shared query heads sharing the same KV head must be power of 2. 
" + ) + kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads) + + # Determine if there are "full" blocks where we only need to apply score_mod, and can skip mask_mod + has_full_blocks = full_kv_num_blocks is not None + kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks) + if not has_full_blocks: + # Create a plackeholder full block list in case it is empty + full_kv_num_blocks, full_kv_indices = ( + empty(0, device=query.get_device()) for _ in range(2) + ) + + ( + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ) = maybe_realize( + [ + query, + key, + value, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ] + ) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int]] = [] + configs.append(_get_decoding_default_config(key)) + # Note: max_autotune is not supported yet. Causes error in lowering the dynamic shape in reduction ops. + if config.max_autotune: + configs += [ + (64, 2, 2), + (32, 2, 3), + (128, 2, 3), + ] + # TODO: fix autotuning. + + kernel_options.setdefault("SM_SCALE", scale) + kernel_options.setdefault("SPLIT_KV", get_split_k(B, Hkv, seq_len_kv)) + MAX_SPLIT_KV = kernel_options["SPLIT_KV"] + + # create config dependent intermediate buffers + buf_ACC_shape = [B, MAX_SPLIT_KV, Hq, seq_len_q, v_head_dim] + buf_ML_shape = buf_ACC_shape[:-1] + buf_M = empty_strided( + buf_ML_shape, + None, + dtype=torch.float32, # The rowmax is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + buf_L = empty_strided( + buf_ML_shape, + None, + dtype=torch.float32, # The intermediate sumexp is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + + layout_acc = FixedLayout( + query.get_device(), + torch.float32, + buf_ACC_shape, + FlexibleLayout.contiguous_strides(buf_ACC_shape), + ) + + kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim) + kernel_options.setdefault("V_HEAD_DIM", v_head_dim) + + kernel_options.setdefault( + "BLOCK_M", + ( + # m + # if V.graph.sizevars.evaluate_expr(sympy.Lt(query.get_size()[-2], 0)) + # else # Always use a BLOCK_M > 16 before Triton fix https://github.com/triton-lang/triton/pull/4061 is in pin + max( + next_power_of_2( + V.graph.sizevars.size_hint( + seq_len_q, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + * gqa_shared_heads + ), + 16, + ) + ), + ) + + query = ir.ExternKernel.realize_input(query) + stride_b, stride_hq, stride_seq_len_q, stride_qk_head_dim = query.get_stride() + + # Reshape query for GQA: [B, Hq, Mq, D] -> [B, Hkv, G, Mq, D] + gqa_query_shape = (B, Hkv, gqa_shared_heads, seq_len_q, qk_head_dim) + gqa_query_stride = ( + stride_b, + stride_hq * gqa_shared_heads, + stride_hq, + stride_seq_len_q, + stride_qk_head_dim, + ) + query = lowerings[aten.as_strided](query, gqa_query_shape, gqa_query_stride) + + V.graph.sizevars.guard_leq( + seq_len_q * gqa_shared_heads, sympy.Integer(kernel_options["BLOCK_M"]) + ) + + kernel_options.setdefault( + "SAFE_M_BOUNDARY", + ((seq_len_q * gqa_shared_heads) % kernel_options["BLOCK_M"]) == 0, + ) + # TODO: This feels sketchy + kernel_options.setdefault("SAFE_N_BOUNDARY", True) + + # Note, we don't need to pass in the captured buffers explicitly + # because they're implicitly added by the score_mod function + # We do need to explicitly pass it in for autotuning though. 
+ for BLOCK_N, num_warps, num_stages in configs: + if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0: + continue + + # Performance tuning + kernel_options.setdefault("BLOCK_N", BLOCK_N) + kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + + # Work around https://github.com/pytorch/pytorch/issues/129625 + if num_stages == 2: + continue + flex_decoding_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + buf_M, + buf_L, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ], + layout=layout_acc, + subgraphs=[ + score_mod_subgraph, + mask_mod_subgraph, + ], + mutated_inputs=[buf_M, buf_L], + num_stages=num_stages, + num_warps=num_warps, + call_sizes=query.get_size(), + **kernel_options, + ) + + inputs_for_flex_decoding = ( + [ + query, + key, + value, + buf_M, + buf_L, + kv_num_blocks, + kv_indices, + full_kv_num_blocks, + full_kv_indices, + ] + + list(score_mod_other_buffers) + + list(mask_mod_other_buffers) + ) + + input_gen_fns = { + 5: create_num_blocks_fake_generator(kv_indices), + 6: create_indices_fake, + 7: create_num_blocks_fake_generator(full_kv_indices), + 8: create_indices_fake, + } + + buf_ACC = autotune_select_algorithm( + "flex_decoding", + choices, + inputs_for_flex_decoding, + layout_acc, + input_gen_fns=input_gen_fns, + ) + + # Reduction + + g_M = lowerings[aten.max](buf_M, dim=1, keepdim=True)[0] + # See [Note] Handle fully masked out rows: + # g_M Is the global max among split kv blocks. + masked_rows = lowerings[aten.eq](g_M, -float("inf")) + adj_M = lowerings[aten.sub](buf_M, g_M) + adj_M = lowerings[aten.where](masked_rows, 0, adj_M) + alpha = lowerings[aten.exp2](adj_M) + + buf_L = lowerings[aten.mul](buf_L, alpha) + g_L = lowerings[aten.sum](buf_L, axis=1) + masked_rows_squeezed = lowerings[aten.squeeze](masked_rows, dim=1) + g_L = lowerings[aten.where](masked_rows_squeezed, 1.0, g_L) + logsumexp = lowerings[aten.log2](g_L) + logsumexp = lowerings[aten.add](logsumexp, lowerings[aten.squeeze](g_M, dim=1)) + + alpha_unseq = lowerings[aten.unsqueeze](alpha, 4) + buf_ACC = lowerings[aten.mul](buf_ACC, alpha_unseq) + output = lowerings[aten.sum](buf_ACC, axis=1) + L_unseq = lowerings[aten.unsqueeze](g_L, 3) + output = lowerings[aten.div](output, L_unseq) + output = lowerings[prims.convert_element_type](output, query.get_dtype()) + + return ( + output, + logsumexp, + ) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py new file mode 100644 index 0000000000000000000000000000000000000000..c2e09300f9b8a7eb6615618c8f34aebebe6e80a2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py @@ -0,0 +1,776 @@ +# mypy: allow-untyped-defs +import functools +import logging +from typing import Any, Dict, List, Optional + +import torch +from torch._inductor.autoheuristic.autoheuristic import AutoHeuristicSelectAlgorithm +from torch._inductor.autoheuristic.autoheuristic_utils import ( + AHContext, + context_add_strides, + context_add_using_tf32, + get_mixedmm_precondition, + mixed_mm_operations, + mm_operations, +) +from torch._inductor.codegen.cpp_gemm_template import CppPackedGemmTemplate +from torch._inductor.virtualized import V + +from .. 
import config as inductor_config +from ..codegen.common import BackendFeature +from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate +from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate +from ..codegen.wrapper import WrapperCodeGen +from ..ir import FlexibleLayout, is_triton +from ..lowering import register_lowering +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + NoValidChoicesError, + TritonTemplate, +) +from ..utils import ( + get_gpu_shared_memory, + use_aten_gemm_kernels, + use_ck_template, + use_cpp_packed_gemm_template, + use_cutlass_template, + use_max_autotune, + use_triton_template, +) +from .mm_common import ( + addmm_epilogue, + extra_mm_configs, + int8_mm_configs, + mixed_mm_configs, + mm_args, + mm_configs, + mm_grid, + mm_options, + triton_config, +) + + +log = logging.getLogger(__name__) +aten = torch.ops.aten + +mm_template = TritonTemplate( + name="mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
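+        # When K is not a multiple of BLOCK_K (EVEN_K is False), the final loop
+        # iteration has k < BLOCK_K; lanes with rk >= k load 0.0 and therefore
+        # contribute nothing to the dot product below.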
+ if B_PROLOGUE_CAST_TYPE is not None: + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + +aten_mm = ExternKernelChoice(torch.mm, "at::mm_out") + +aten_addmm = ExternKernelChoice( + torch.addmm, "at::addmm_out", op_overload=aten.addmm.default +) + +aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm") + +aten__sparse_semi_structured_mm = ExternKernelChoice( + torch._sparse_semi_structured_mm, + "at::_sparse_semi_structured_mm", + has_out_variant=False, +) + + +def _is_int8_mat(mat): + return mat.get_dtype() in (torch.int8, torch.uint8) + + +def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1): + """ + Giving torch.addmm a 1D tensor calls a different (faster) cublasLt + kernel under the hood. There are a few shapes where this is slower, + but they are rare. + """ + if inp.stride(0) == 0 or inp.size(0) == 1: + return torch.addmm(inp[0], mat1, mat2, out=out, alpha=alpha, beta=beta) + return torch.addmm(inp, mat1, mat2, out=out, alpha=alpha, beta=beta) + + +aten_bias_addmm = ExternKernelChoice(bias_addmm, None) + + +@register_lowering(aten.mm, type_promotion_kind=None) +def tuned_mm(mat1, mat2, *, layout=None): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + name = "mm" + + aten_layout = layout + if not use_max_autotune(): + aten_layout = FlexibleLayout( + device=layout.device, dtype=layout.dtype, size=layout.size + ) + + # options to tune from + choices = ( + [aten_mm.bind((mat1, mat2), aten_layout)] if use_aten_gemm_kernels() else [] + ) + static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout) + if is_nonzero and use_triton_template(layout): + for config in mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2]) + + if is_nonzero and use_ck_template(layout, m, n, k): + CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2]) + + if use_cpp_packed_gemm_template(layout, mat1, mat2): + CppPackedGemmTemplate.add_choices( + choices, + layout, + [mat1, mat2], + ) + + input_nodes = [mat1, mat2] + if ( + is_nonzero + and use_triton_template(layout) + and torch._inductor.config.run_autoheuristic(name) + and is_triton(mat1) + ): + always_included = [] + if use_aten_gemm_kernels(): + always_included.append("extern_mm") + num_choices_before_extra_configs = len(choices) + for config in extra_mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + + # using AutoHeuristic for ranking + ah_choices = mm_autoheuristic( + mat1, + mat2, + m, + n, + k, + choices, + name, + input_nodes, + mm_operations(), + None, + top_k=10, + always_included=always_included, + ) + if not torch._inductor.config.collect_autoheuristic(name): + # if we are collecting data, we do not want to modify choices + if ah_choices is not None and len(ah_choices) > 0: + # the order in which autoheuristic returns choices is not the same 
as
+                # the order of choices, which affects things like epilogue fusion.
+                # Once epilogue fusion benchmarks choices in sorted order, I think we can
+                # just use the order returned by autoheuristic.
+                choices = [choice for choice in choices if choice in ah_choices]
+            else:
+                choices = choices[:num_choices_before_extra_configs]
+
+    if (
+        len(choices) == 0
+        and not use_aten_gemm_kernels()
+        and inductor_config.autotune_fallback_to_aten
+    ):
+        log.warning("No choices for GEMM, using ATen backend as fallback")
+        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
+
+    try:
+        return autotune_select_algorithm(name, choices, [mat1, mat2], layout)
+    except NoValidChoicesError:
+        if not inductor_config.autotune_fallback_to_aten:
+            raise
+        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
+        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
+
+
+def _is_static_problem(inputs_tensors, layout):
+    # Checks whether all input tensors and the output layout have a static
+    # shape, by attempting to convert the dimensions to int.
+    static_shape = True
+    static_size = WrapperCodeGen.statically_known_list_of_ints_or_none(layout.size)
+    if static_size is None:
+        nonzero = True
+        for s in layout.size:
+            sz = WrapperCodeGen.statically_known_int_or_none(s)
+            if sz is not None and sz == 0:
+                nonzero = False
+                break
+        return False, nonzero
+    numel = 1
+    for dim in static_size:
+        numel *= dim
+    nonzero = numel > 0
+    return static_shape, nonzero
+
+
+@register_lowering(aten._int_mm, type_promotion_kind=None)
+def tuned_int_mm(mat1, mat2, *, layout=None):
+    m, n, k, layout, mat1, mat2 = mm_args(
+        mat1, mat2, layout=layout, out_dtype=torch.int32
+    )
+    static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout)
+    use_cutlass = static_shape and is_nonzero and use_cutlass_template(layout, m, n, k)
+
+    choices = (
+        [aten__int_mm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else []
+    )
+
+    # TODO: Re-enable eager mode implementation once cuBLAS is fixed
+    if use_cutlass or use_triton_template(layout, enable_int32=True):
+        choices = []
+
+    if use_cutlass:
+        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
+            choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True
+        )
+    if is_nonzero and use_triton_template(layout, enable_int32=True):
+        for config in int8_mm_configs(m, n, k):
+            mm_template.maybe_append_choice(
+                choices,
+                input_nodes=(mat1, mat2),
+                layout=layout,
+                **mm_options(config, m, n, k, layout),
+            )
+    if len(choices) == 0:
+        log.warning(
+            "No choices for integer GEMM available with the configured backends, using ATen backend as fallback"
+        )
+        choices = [aten__int_mm.bind((mat1, mat2), layout)]
+
+    try:
+        return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
+    except NoValidChoicesError:
+        if not inductor_config.autotune_fallback_to_aten:
+            raise
+        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
+        choices = [aten__int_mm.bind((mat1, mat2), layout)]
+        return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
+
+
+@register_lowering(aten.addmm, type_promotion_kind=None)
+def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
+    ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
+    m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
+    static_shape, is_nonzero = _is_static_problem([inp, mat1, mat2], layout)
+    if (not is_nonzero) or (not use_max_autotune()):
+        # Use a FlexibleLayout if we are not autotuning.
+ # This allows padding strides for the output. + from torch._inductor.ir import FixedLayout, FlexibleLayout + + if isinstance(layout, FixedLayout): + layout = FlexibleLayout( + device=layout.device, dtype=layout.dtype, size=layout.size + ) + choices = ( + [ + aten_addmm.bind( + (inp, mat1, mat2), + layout, + alpha=alpha, + beta=beta, + ) + ] + if use_aten_gemm_kernels() + else [] + ) + return autotune_select_algorithm("addmm", choices, [inp, mat1, mat2], layout) + + choices = ( + [ + aten_addmm.bind( + (inp_expanded, mat1, mat2), + layout, + alpha=alpha, + beta=beta, + ) + ] + if use_aten_gemm_kernels() + else [] + ) + + if ( + use_aten_gemm_kernels() + and inp_expanded.get_stride()[0] == 0 + and inp_expanded.get_device().type == "cuda" + and inductor_config.triton.autotune_cublasLt + ): + # unexpand inp to make sure fused addmm from cublasLt is used + choices.insert( + 0, + aten_bias_addmm.bind( + (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta + ), + ) + + if is_nonzero and use_triton_template(layout): + for config in mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(inp_expanded, mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + prefix_args=1, + epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta), + ) + + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + # Filter out a known cause of CUDA illegal memory access errors + # broadcasting on the last dim of the bias term seems not to be working + # in the linear GEMM epilogue used by addmm. + if ( + WrapperCodeGen.statically_known_int_or_none(inp_expanded.layout.stride[-1]) + != 0 + ): + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices( + choices, + layout, + [mat1, mat2, inp_expanded], + alpha=alpha, + beta=beta, + ) + + if is_nonzero and use_ck_template(layout, m, n, k): + CKGemmTemplate.add_ck_gemm_choices( + choices, + layout, + [mat1, mat2, inp_expanded], + alpha=alpha, + beta=beta, + ) + + if use_cpp_packed_gemm_template(layout, mat1, mat2): + CppPackedGemmTemplate.add_choices( + choices, + layout, + [inp_expanded, mat1, mat2], + alpha=alpha, + beta=beta, + has_bias=True, + ) + + add_aten_fallback = False + if len(choices) == 0: + log.warning("No choices for GEMM, using ATen backend as fallback") + add_aten_fallback = True + + if add_aten_fallback: + choices.append( + aten_addmm.bind( + (inp_expanded, mat1, mat2), + layout, + ordered_kwargs_for_cpp_kernel, + alpha=alpha, + beta=beta, + ) + ) + + if ( + inp_expanded.get_stride()[0] == 0 + and inp_expanded.get_device().type == "cuda" + and inductor_config.triton.autotune_cublasLt + ): + # unexpand inp to make sure fused addmm from cublasLt is used + choices.insert( + 0, + aten_bias_addmm.bind( + (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta + ), + ) + try: + return autotune_select_algorithm( + "addmm", choices, [inp_expanded, mat1, mat2], layout + ) + except NoValidChoicesError: + if not inductor_config.autotune_fallback_to_aten: + raise + log.warning("All choices for GEMM were invalid, using ATen backend as fallback") + fallback_choice = aten_addmm.bind( + (inp, mat1, mat2), + layout, + ordered_kwargs_for_cpp_kernel, + alpha=alpha, + beta=beta, + ) + return fallback_choice.output_node() + + +@register_lowering(aten._sparse_semi_structured_mm, type_promotion_kind=None) +def tuned_sparse_semi_structured_mm( + mat1, mat1_meta, mat2, *, out_dtype=None, layout=None +): + from torch._inductor.select_algorithm import realize_inputs + + mat1, mat1_meta, mat2 = realize_inputs(mat1, 
mat1_meta, mat2) + m1, k1 = mat1.get_size() + m2, _ = mat1_meta.get_size() + k2, n = mat2.get_size() + m = V.graph.sizevars.guard_equals(m1, m2) + k = V.graph.sizevars.guard_equals(2 * k1, k2) + + if layout is None: + from torch._inductor.ir import FixedLayout + + layout = FixedLayout( + mat2.get_device(), + out_dtype if out_dtype else mat2.get_dtype(), + [m, n], + [n, 1], + ) + else: + assert out_dtype is None, "out_dtype is ignored if layout is specified." + + choices = ( + [ + aten__sparse_semi_structured_mm.bind( + (mat1, mat1_meta, mat2), layout, out_dtype=out_dtype + ) + ] + if use_aten_gemm_kernels() + else [] + ) + + if m * n != 0 and use_cutlass_template(layout, m, n, k): + CUTLASS2xGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2, mat1_meta], fuseable=True, non_fuseable=True + ) + + return autotune_select_algorithm( + "sparse_semi_structured_mm", choices, [mat1, mat1_meta, mat2], layout + ) + + +def fallback_mixed_mm(mat1, mat2, *, out): + return torch.mm(mat1, mat2.to(mat1.dtype), out=out) + + +aten_fallback_mixed_mm = ExternKernelChoice(fallback_mixed_mm, None) + + +@functools.lru_cache(None) +def _is_sm7x_or_older_gpu(index: Optional[int]) -> bool: + props = torch.cuda.get_device_properties(index or 0) + return props.major <= 7 + + +def dims_are_int(dims): + return all(isinstance(dim, int) for dim in dims) + + +def try_heuristic(m, n, k, choices, mat1, mat2, mat2_dtype, layout): + m, n, k = get_size_hints(mat1, mat2, m, n, k) + if not dims_are_int([m, n, k]): + return None + + if mat1.dtype != torch.float16: + return None + + # only use heuristic if we are running on an A100 + # torch.cuda.get_device_capability() >= (8, 0) returns true for A10G + # which does not have enough shared memory for one of the configs + if ( + not torch.cuda.get_device_capability() >= (8, 0) + ) or get_gpu_shared_memory() != 166912: + return None + + if m == 1 and (n % 16 != 0 or k % 16 != 0): + return None + + if m <= 16 and n >= 4096 and k >= 4096: + return triton_config( + BLOCK_M=16, + BLOCK_N=64, + BLOCK_K=128, + num_stages=5, + num_warps=4, + ) + elif m > 16 and m <= 32 and n >= 4096 and k >= 4096: + return triton_config( + BLOCK_M=32, + BLOCK_N=32, + BLOCK_K=128, + num_stages=5, + num_warps=4, + ) + elif m > 32 and m <= 64 and n >= 4096 and k >= 4096: + return triton_config( + BLOCK_M=64, + BLOCK_N=32, + BLOCK_K=128, + num_stages=5, + num_warps=4, + ) + return None + + +def mm_autoheuristic( + mat1, + mat2, + m, + n, + k, + choices, + name, + input_nodes, + ops, + precondition, + top_k: Optional[int] = None, + always_included=None, +): + m, n, k = get_size_hints(mat1, mat2, m, n, k) + if not dims_are_int([m, n, k]): + return None + mat1_stride, mat2_stride = get_size_hints_strides(mat1, mat2) + + def get_context(m, k, n, mat1, mat2, mat1_stride, mat2_stride): + context = AHContext() + context.add_feature("m", m) + context.add_feature("k", k) + context.add_feature("n", n) + context.add_feature("mat1_dtype", mat1.layout.dtype, is_categorical=True) + context.add_feature("mat2_dtype", mat2.layout.dtype, is_categorical=True) + context_add_strides(context, "mat1", mat1_stride) + context_add_strides(context, "mat2", mat2_stride) + context.add_feature( + "mat1_iscontig", mat1.layout.is_contiguous(), is_categorical=True + ) + context.add_feature( + "mat2_iscontig", mat2.layout.is_contiguous(), is_categorical=True + ) + if name == "mm": + # for mixed_mm, we only consider fp16 + context_add_using_tf32(context, mat1.layout.dtype) + return context + + def fallback(): + return None + + 
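+    # Assumption about the contract here (not spelled out in this file):
+    # fallback() returning None signals that no heuristic decision was made,
+    # and callers such as tuned_mm then run regular autotuning over all choices.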
context = get_context(m, k, n, mat1, mat2, mat1_stride, mat2_stride) + autoheuristic = AutoHeuristicSelectAlgorithm( + fallback=fallback, + choices=choices, + input_nodes=input_nodes, + context=context, + name=name, + augment_context=ops, + precondition=precondition, + ) + + if top_k is not None: + # TODO: is there a cleaner way to ensure aten.mm is always included? + return autoheuristic.get_top_k_choices_caller( + top_k, always_included=always_included + ) + + return autoheuristic.get_choice_caller() + + +def get_size_hints(mat1, mat2, m, n, k): + if not isinstance(m, int) or not isinstance(k, int): + (m, k) = V.graph.sizevars.size_hints( + mat1.get_size(), + fallback=torch._inductor.config.unbacked_symint_fallback, + ) + + if not isinstance(n, int) or not isinstance(k, int): + (k, n) = V.graph.sizevars.size_hints( + mat2.get_size(), + fallback=torch._inductor.config.unbacked_symint_fallback, + ) + return m, n, k + + +def get_size_hints_strides(mat1, mat2): + mat1_stride = mat1.layout.stride + mat2_stride = mat2.layout.stride + strides = [mat1_stride, mat2_stride] + strides_hints = [] + for stride in strides: + if not isinstance(stride, int): + stride = V.graph.sizevars.size_hints( + stride, + fallback=torch._inductor.config.unbacked_symint_fallback, + ) + strides_hints.append(stride) + return strides_hints[0], strides_hints[1] + + +def tuned_mixed_mm(mat1, mat2, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None) + static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout) + + fallback = aten_fallback_mixed_mm.bind((mat1, mat2), layout) + + choices = [fallback] + + # can't use triton kernel unless one of these is true or if running on v100 (numerical issues) + skip_triton = ( + ( + mat1.layout.dtype != torch.float32 + and not (mat2.layout.is_contiguous() or mat2.layout.is_transposed()) + ) + or _is_sm7x_or_older_gpu(layout.device.index) + or inductor_config.mixed_mm_choice == "aten" + or not V.graph.has_feature(layout.device, BackendFeature.TRITON_TEMPLATES) + or ( + mat1.layout.dtype == torch.float32 and torch.backends.cuda.matmul.allow_tf32 + ) + or (mat1.layout.dtype == torch.bfloat16 and mat2.layout.dtype == torch.uint8) + ) + + if inductor_config.mixed_mm_choice == "triton": + choices = [] + + if not skip_triton: + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + if static_shape and inductor_config.mixed_mm_choice == "heuristic": + choices = [] + config = try_heuristic(m, n, k, choices, mat1, mat2, mat2_dtype, layout) + if config is not None: + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + choices.append(fallback) + + has_int8_tensor = _is_int8_mat(mat1) or _is_int8_mat(mat2) + for config in mixed_mm_configs(m, n, k, has_int8_tensor=has_int8_tensor): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + + if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): + CUTLASS3xGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True + ) + CUTLASS2xGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True + ) + + if skip_triton and not choices: + choices = [fallback] + + name = "mixed_mm" + input_nodes = [mat1, mat2] + if torch._inductor.config.run_autoheuristic(name): + choice = mm_autoheuristic( + mat1, + mat2, 
+ m, + n, + k, + choices, + name, + input_nodes, + mixed_mm_operations(), + get_mixedmm_precondition, + ) + if ( + not skip_triton + and inductor_config.mixed_mm_choice == "heuristic" + and choice is not None + ): + choices.insert(0, choice) + return autotune_select_algorithm(name, choices, input_nodes, layout) + + +# This op is a special case of the int_mm op which we use based on the pattern +# _int_mm -> mul (defined in ../fx_passes/post_grad.py) in order to prevent +# realization of the int32 _int_mm output by forcing fusion with the mul op. +# This is only used when config.force_fuse_int_mm_with_mul = True +def tuned_fused_int_mm_mul(mat1, mat2, mat3, out_dtype, *, layout=None): + out_dtype = ( + torch.promote_types(mat3.get_dtype(), torch.int32) + if out_dtype is None + else out_dtype + ) + m, n, k, layout, mat1, mat2, mat3 = mm_args( + mat1, mat2, mat3, layout=layout, out_dtype=out_dtype + ) + choices: List[Dict[Any, Any]] = [] + for config in int8_mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2, mat3), + layout=layout, + **dict(mm_options(config, m, n, k, layout), ACC_TYPE="tl.int32"), + suffix_args=1, + epilogue_fn=V.ops.mul, + ) + return autotune_select_algorithm("int_mm", choices, [mat1, mat2, mat3], layout) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py new file mode 100644 index 0000000000000000000000000000000000000000..0c66da4ca4f314e09e996c2d9860c8f3b9365086 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py @@ -0,0 +1,466 @@ +# mypy: allow-untyped-defs +import functools +import itertools +import logging +from typing import cast, List, Tuple + +import sympy + +import torch +from torch._inductor.select_algorithm import realize_inputs +from torch._inductor.virtualized import V + +from .. 
import config as inductor_config +from ..runtime.runtime_utils import next_power_of_2 +from ..utils import ceildiv as cdiv + + +log = logging.getLogger(__name__) + + +def triton_config(num_stages, num_warps, **kwargs): + from triton import Config + + return Config(kwargs, num_stages=num_stages, num_warps=num_warps) + + +def filtered_configs( + m: int, + n: int, + k: int, + configs: List[Tuple[int, int, int, int, int]], + has_int8_tensor=False, +): + """Heuristic to shrink configs when they are bigger than the input size""" + + min_block_size = 16 + # block_k=16 seems to be causing issues + # see: https://github.com/triton-lang/triton/issues/2156#issuecomment-1695897424 + min_block_size_k = 32 if has_int8_tensor else 16 + m = max( + next_power_of_2( + V.graph.sizevars.size_hint( + m, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + n = max( + next_power_of_2( + V.graph.sizevars.size_hint( + n, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + k = max( + next_power_of_2( + V.graph.sizevars.size_hint( + k, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size_k, + ) + used = set() + for block_m, block_n, block_k, num_stages, num_warps in configs: + # shrink configs for small sizes + block_m = max(min(block_m, m), min_block_size) + block_n = max(min(block_n, n), min_block_size) + block_k = max(min(block_k, k), min_block_size_k) + # each warp computes 16x16 tile = 256 + num_warps = min(num_warps, block_m * block_n // 256) + if torch.version.hip: + for matrix_instr_nonkdim in [0, 16]: + if matrix_instr_nonkdim != 0 and ( + block_m % matrix_instr_nonkdim != 0 + or block_n % matrix_instr_nonkdim != 0 + ): + # block_m and block_n must be a multiple of matrix_instr_nonkdim + continue + if ( + block_m, + block_n, + block_k, + num_stages, + num_warps, + matrix_instr_nonkdim, + ) not in used: + used.add( + ( + block_m, + block_n, + block_k, + num_stages, + num_warps, + matrix_instr_nonkdim, + ) + ) + yield triton_config( + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_stages=num_stages, + num_warps=num_warps, + matrix_instr_nonkdim=matrix_instr_nonkdim, + ) + else: + if (block_m, block_n, block_k, num_stages, num_warps, 0) not in used: + used.add((block_m, block_n, block_k, num_stages, num_warps, 0)) + yield triton_config( + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_stages=num_stages, + num_warps=num_warps, + ) + + +# List of dictionaries to store the kernel configs. Configs that evaluate to true +# will be utilised on the target platform. 
The configs are as follows:
+# (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)
+mm_kernel_configs = (
+    [
+        {"config": (32, 32, 16, 1, 2), "cond": True},
+        {"config": (32, 32, 128, 2, 4), "cond": torch.version.hip is None},
+        {"config": (32, 64, 32, 5, 8), "cond": True},
+        {"config": (64, 32, 32, 5, 8), "cond": True},
+        {"config": (64, 32, 128, 5, 4), "cond": True},
+        {"config": (64, 64, 16, 2, 4), "cond": True},
+        {"config": (64, 64, 32, 2, 4), "cond": True},
+        {"config": (64, 64, 64, 3, 8), "cond": True},
+        {"config": (64, 64, 128, 5, 4), "cond": True},
+        {"config": (64, 128, 32, 3, 4), "cond": True},
+        {"config": (64, 128, 32, 4, 8), "cond": True},
+        {"config": (64, 128, 64, 3, 4), "cond": True},
+        {"config": (64, 128, 128, 4, 4), "cond": True},
+        {"config": (128, 64, 32, 3, 4), "cond": True},
+        {"config": (128, 64, 32, 4, 8), "cond": True},
+        {"config": (128, 128, 32, 2, 8), "cond": True},
+        {"config": (128, 128, 32, 3, 4), "cond": True},
+        {"config": (128, 128, 64, 3, 4), "cond": True},
+        {"config": (128, 128, 64, 5, 8), "cond": True},
+    ]
+    if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE"
+    else [
+        {"config": (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps), "cond": True}
+        for BLOCK_M, BLOCK_N, BLOCK_K in itertools.product(
+            [16, 32, 64, 128, 256], repeat=3
+        )
+        for num_stages in [1, 2, 3, 4, 5]
+        for num_warps in [2, 4, 8]
+    ]
+)
+
+# These are only used in tuned_mm when AutoHeuristic is enabled.
+# The idea is that when AutoHeuristic collects data to learn a heuristic, more configs are autotuned;
+# when the learned heuristic is used, it reduces the number of configs down to 10,
+# which saves compilation time (since fewer configs are autotuned) and can potentially
+# increase performance, because the learned heuristic might predict a config that is
+# not part of mm_configs.
+extra_mm_kernel_configs = [
+    {"config": (16, 32, 16, 3, 2), "cond": True},
+    {"config": (16, 32, 32, 4, 2), "cond": True},
+    {"config": (16, 32, 32, 5, 2), "cond": True},
+    {"config": (64, 64, 128, 3, 4), "cond": True},
+    {"config": (128, 64, 32, 2, 2), "cond": True},
+    {"config": (128, 64, 64, 3, 8), "cond": True},
+    {"config": (128, 64, 128, 4, 8), "cond": True},
+    {"config": (128, 128, 32, 4, 4), "cond": True},
+    {"config": (128, 128, 64, 3, 8), "cond": True},
+    {"config": (128, 128, 64, 5, 4), "cond": True},
+]
+
+int8_mm_kernel_configs = [
+    {"config": (64, 64, 32, 2, 4), "cond": True},
+    {"config": (64, 128, 32, 3, 4), "cond": True},
+    {"config": (128, 64, 32, 3, 4), "cond": True},
+    {"config": (64, 128, 32, 4, 8), "cond": True},
+    {"config": (128, 64, 32, 4, 8), "cond": True},
+    {"config": (64, 32, 32, 5, 8), "cond": True},
+    {"config": (32, 64, 32, 5, 8), "cond": True},
+    {"config": (128, 128, 32, 2, 8), "cond": True},
+    {"config": (64, 64, 64, 3, 8), "cond": True},
+    # {"config": (32, 32, 128, 2, 4), "cond": True},
+    # {"config": (64, 64, 16, 2, 4), "cond": True},
+    # {"config": (32, 32, 16, 1, 2), "cond": True},
+    {"config": (128, 256, 128, 3, 8), "cond": torch.version.hip is None},
+    {"config": (256, 128, 128, 3, 8), "cond": torch.version.hip is None},
+]
+
+# Mixed-precision kernel configs for small sizes of m, for mm's like (16, 8192) x (8192, 8192).
+mixed_mm_kernel_configs_small_m = [ + {"config": (16, 128, 256, 3, 4), "cond": True}, + {"config": (16, 128, 256, 5, 8), "cond": True}, +] + +mixed_mm_kernel_configs = ( + mm_kernel_configs + mixed_mm_kernel_configs_small_m + if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE" + else mm_kernel_configs +) + +scaled_mm_kernel_configs = [ + {"config": (128, 256, 32, 3, 8), "cond": True}, + {"config": (256, 128, 32, 3, 8), "cond": True}, + {"config": (256, 64, 32, 4, 4), "cond": True}, + {"config": (64, 256, 32, 4, 4), "cond": True}, + {"config": (128, 128, 32, 4, 4), "cond": True}, + {"config": (128, 64, 32, 4, 4), "cond": True}, + {"config": (64, 128, 32, 4, 4), "cond": True}, + {"config": (128, 32, 32, 4, 4), "cond": True}, + {"config": (64, 32, 32, 5, 2), "cond": True}, + {"config": (256, 128, 128, 3, 8), "cond": True}, + {"config": (256, 64, 128, 4, 4), "cond": True}, + {"config": (64, 256, 128, 4, 4), "cond": True}, + {"config": (128, 128, 128, 4, 4), "cond": True}, + {"config": (128, 64, 64, 4, 4), "cond": True}, + {"config": (64, 128, 64, 4, 4), "cond": True}, + {"config": (128, 32, 64, 4, 4), "cond": True}, + {"config": (64, 32, 64, 5, 2), "cond": True}, + {"config": (16, 32, 32, 2, 2), "cond": True}, + {"config": (16, 64, 32, 2, 2), "cond": True}, + {"config": (16, 128, 32, 2, 4), "cond": True}, + {"config": (16, 256, 32, 2, 4), "cond": True}, + {"config": (16, 32, 64, 2, 2), "cond": True}, + {"config": (16, 64, 64, 2, 2), "cond": True}, + {"config": (16, 128, 64, 2, 4), "cond": True}, + {"config": (16, 256, 64, 2, 4), "cond": True}, + {"config": (32, 32, 32, 2, 2), "cond": True}, + {"config": (32, 64, 32, 2, 2), "cond": True}, + {"config": (32, 128, 32, 2, 4), "cond": True}, + {"config": (32, 256, 32, 2, 4), "cond": True}, + {"config": (32, 32, 64, 2, 2), "cond": True}, + {"config": (32, 64, 64, 2, 2), "cond": True}, + {"config": (32, 128, 64, 2, 4), "cond": True}, + {"config": (32, 256, 64, 2, 4), "cond": True}, + {"config": (16, 32, 32, 3, 2), "cond": True}, + {"config": (16, 64, 32, 3, 2), "cond": True}, + {"config": (16, 128, 32, 3, 4), "cond": True}, + {"config": (16, 256, 32, 3, 4), "cond": True}, + {"config": (16, 32, 64, 3, 2), "cond": True}, + {"config": (16, 64, 64, 3, 2), "cond": True}, + {"config": (16, 128, 64, 3, 4), "cond": True}, + {"config": (16, 256, 64, 3, 4), "cond": True}, + {"config": (32, 32, 32, 3, 2), "cond": True}, + {"config": (32, 64, 32, 3, 2), "cond": True}, + {"config": (32, 128, 32, 3, 4), "cond": True}, + {"config": (32, 256, 32, 3, 4), "cond": True}, + {"config": (32, 32, 64, 3, 2), "cond": True}, + {"config": (32, 64, 64, 3, 2), "cond": True}, + {"config": (32, 128, 64, 3, 4), "cond": True}, + {"config": (32, 256, 64, 3, 4), "cond": True}, + {"config": (16, 32, 32, 4, 2), "cond": True}, + {"config": (16, 64, 32, 4, 2), "cond": True}, + {"config": (16, 128, 32, 4, 4), "cond": True}, + {"config": (16, 256, 32, 4, 4), "cond": True}, + {"config": (16, 32, 64, 4, 2), "cond": True}, + {"config": (16, 64, 64, 4, 2), "cond": True}, + {"config": (16, 128, 64, 4, 4), "cond": True}, + {"config": (16, 256, 64, 4, 4), "cond": True}, + {"config": (32, 32, 32, 4, 2), "cond": True}, + {"config": (32, 64, 32, 4, 2), "cond": True}, + {"config": (32, 128, 32, 4, 4), "cond": True}, + {"config": (32, 256, 32, 4, 4), "cond": True}, + {"config": (32, 32, 64, 4, 2), "cond": True}, + {"config": (32, 64, 64, 4, 2), "cond": True}, + {"config": (32, 128, 64, 4, 4), "cond": True}, + {"config": (32, 256, 64, 4, 4), "cond": True}, + {"config": (16, 32, 32, 5, 2), 
"cond": True}, + {"config": (16, 64, 32, 5, 2), "cond": True}, + {"config": (16, 128, 32, 5, 4), "cond": True}, + {"config": (16, 256, 32, 5, 4), "cond": True}, + {"config": (16, 32, 64, 5, 2), "cond": True}, + {"config": (16, 64, 64, 5, 2), "cond": True}, + {"config": (16, 128, 64, 5, 4), "cond": True}, + {"config": (16, 256, 64, 5, 4), "cond": True}, + {"config": (32, 32, 32, 5, 2), "cond": True}, + {"config": (32, 64, 32, 5, 2), "cond": True}, + {"config": (32, 128, 32, 5, 4), "cond": True}, + {"config": (32, 256, 32, 5, 4), "cond": True}, + {"config": (32, 32, 64, 5, 2), "cond": True}, + {"config": (32, 64, 64, 5, 2), "cond": True}, + {"config": (32, 128, 64, 5, 4), "cond": True}, + {"config": (32, 256, 64, 5, 4), "cond": True}, + {"config": (16, 32, 32, 6, 2), "cond": True}, + {"config": (16, 64, 32, 6, 2), "cond": True}, + {"config": (16, 128, 32, 6, 4), "cond": True}, + {"config": (16, 256, 32, 6, 4), "cond": True}, + {"config": (16, 32, 64, 6, 2), "cond": True}, + {"config": (16, 64, 64, 6, 2), "cond": True}, + {"config": (16, 128, 64, 6, 4), "cond": True}, + {"config": (16, 256, 64, 6, 4), "cond": True}, + {"config": (32, 32, 32, 6, 2), "cond": True}, + {"config": (32, 64, 32, 6, 2), "cond": True}, + {"config": (32, 128, 32, 6, 4), "cond": True}, + {"config": (32, 256, 32, 6, 4), "cond": True}, + {"config": (32, 32, 64, 6, 2), "cond": True}, + {"config": (32, 64, 64, 6, 2), "cond": True}, + {"config": (32, 128, 64, 6, 4), "cond": True}, + {"config": (32, 256, 64, 6, 4), "cond": True}, +] + + +# Create filtered list of configs based on cond evaluation +mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in mm_kernel_configs + if config["cond"] +) +extra_mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in extra_mm_kernel_configs + if config["cond"] +) +int8_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in int8_mm_kernel_configs + if config["cond"] +) +mixed_mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in mixed_mm_kernel_configs + if config["cond"] +) +scaled_mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in scaled_mm_kernel_configs + if config["cond"] +) + +# On ROCm convert num_stages to 0 to enable software pipelining +if torch.version.hip: + mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in mm_platform_configs + ) + extra_mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in extra_mm_platform_configs + ) + int8_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in mm_platform_configs + ) + mixed_mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in mixed_mm_platform_configs + ) + scaled_mm_platform_configs = tuple( + (config[0], config[1], config[2], 0, config[4]) + for config in scaled_mm_platform_configs + ) + +mm_configs = functools.partial( + filtered_configs, + configs=mm_platform_configs, +) + +extra_mm_configs = functools.partial( + filtered_configs, + configs=extra_mm_platform_configs, +) + +int8_mm_configs = functools.partial( + filtered_configs, + configs=int8_platform_configs, +) + +mixed_mm_configs = functools.partial( + filtered_configs, + configs=mixed_mm_platform_configs, +) + +scaled_mm_configs = functools.partial( + filtered_configs, + 
configs=scaled_mm_platform_configs, +) + + +def mm_grid(m, n, meta): + """ + The CUDA grid size for matmul triton templates. + """ + return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), 1, 1) + + +def acc_type(dtype): + if dtype in (torch.float16, torch.bfloat16): + return "tl.float32" + return f"tl.{dtype}".replace("torch.", "") + + +def mm_options(config, sym_m, sym_n, sym_k, layout, b_prologue_cast_type=None): + """ + Common options to matmul triton templates. + """ + even_k_symbolic = ( + # it isn't worth guarding on this + sympy.gcd(sym_k, config.kwargs["BLOCK_K"]) + == config.kwargs["BLOCK_K"] + ) + allow_tf32 = torch.backends.cuda.matmul.allow_tf32 and ( + not inductor_config.force_same_precision + or ((sym_m % 16) == 0 and (sym_n % 16) == 0 and (sym_k % 8) == 0) + ) + return dict( + GROUP_M=8, + EVEN_K=even_k_symbolic, + ALLOW_TF32=allow_tf32, + ACC_TYPE=acc_type(layout.dtype), + B_PROLOGUE_CAST_TYPE=b_prologue_cast_type, + num_stages=config.num_stages, + num_warps=config.num_warps, + **config.kwargs, + ) + + +def mm_args( + mat1, + mat2, + *others, + layout=None, + out_dtype=None, + use_4x2_dim=False, + mat2_transposed=False, +): + """ + Common arg processing for mm,bmm,addmm,etc + """ + mat1, mat2 = realize_inputs(mat1, mat2) + *b1, m, k1 = mat1.get_size() + if mat2_transposed: + *b2, n, k2 = mat2.get_size() + else: + *b2, k2, n = mat2.get_size() + b = [V.graph.sizevars.guard_equals(a, b) for a, b in zip(b1, b2)] + if use_4x2_dim: + k2 = k2 * 2 + k = V.graph.sizevars.guard_equals(k1, k2) + if layout is None: + from torch._inductor.ir import FixedLayout + + if out_dtype is None: + out_dtype = mat1.get_dtype() + + layout = FixedLayout( + mat1.get_device(), + out_dtype, + [*b, m, n], + ) + else: + assert out_dtype is None, "out_dtype is ignored if layout is specified." 
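+    # Any extra inputs (e.g. the bias passed to addmm) are broadcast to the
+    # output shape below, so an epilogue can load them with the same (m, n)
+    # indices it uses for the accumulator.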
+ from ..lowering import expand + + others = [realize_inputs(expand(x, layout.size)) for x in others] + + return [m, n, k, layout, mat1, mat2, *others] + + +def addmm_epilogue(dtype, alpha, beta): + def epilogue(acc, bias): + if alpha != 1: + acc = V.ops.mul(acc, V.ops.constant(alpha, dtype)) + if beta != 1: + bias = V.ops.mul(bias, V.ops.constant(beta, dtype)) + return V.ops.add(acc, bias) + + return epilogue diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..f432cbb13b6f14d1429aa6a4df0579015ef46975 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py @@ -0,0 +1,248 @@ +# mypy: allow-untyped-defs +import functools + +import torch + +from ..lowering import lowerings +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import use_aten_gemm_kernels, use_triton_template +from ..virtualized import V +from .mm_common import mm_args, mm_grid, mm_options + + +aten = torch.ops.aten + +aten_mm_plus_mm = ExternKernelChoice( + torch.ops.inductor._mm_plus_mm, "torch::inductor::_mm_plus_mm" +) + +mm_plus_mm_template = TritonTemplate( + name="mm_plus_mm", + grid=mm_grid, + debug=False, + source=r""" +{{def_kernel("A", "B", "C", "D")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K1 = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + # K2 = {{size("C", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + stride_cm = {{stride("C", 0)}} + stride_ck = {{stride("C", 1)}} + stride_dk = {{stride("D", 0)}} + stride_dn = {{stride("D", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + if (((stride_am == 1 and stride_ak == M) or (stride_am == K1 and stride_ak == 1)) + and ((stride_cm == 1 and stride_ck == M) or (stride_cm == K1 and stride_ck == 1))): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + + if (((stride_bk == 1 and stride_bn == K1) or (stride_bk == N and stride_bn == 1)) + and ((stride_dk == 1 and stride_dn == K1) or (stride_dk == N and stride_dn == 1))): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + C = C + (ram[:, None] * stride_cm + rk[None, :] * stride_ck) + D = D + (rk[:, None] * stride_dk + rbn[None, :] * stride_dn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k1 in range(K1, 0, -BLOCK_K): + # First matmul with A @ B + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k1, other=0.) + b = tl.load(B, mask=rk[:, None] < k1, other=0.) 
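+        # Both GEMMs accumulate into the same acc tile, so the elementwise add
+        # is fused away; tuned_mm_plus_mm below only takes this path when A/C
+        # and B/D have statically identical shapes.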
+        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
+        A += BLOCK_K * stride_ak
+        B += BLOCK_K * stride_bk
+
+    for k2 in range(K1, 0, -BLOCK_K):
+
+        # Second matmul with C @ D
+        if EVEN_K:
+            c = tl.load(C)
+            d = tl.load(D)
+        else:
+            c = tl.load(C, mask=rk[None, :] < k2, other=0.)
+            d = tl.load(D, mask=rk[:, None] < k2, other=0.)
+        acc += tl.dot(c, d, allow_tf32=ALLOW_TF32)
+        C += BLOCK_K * stride_ck
+        D += BLOCK_K * stride_dk
+
+    idx_m = rm[:, None]
+    idx_n = rn[None, :]
+    mask = (idx_m < M) & (idx_n < N)
+
+    # inductor generates a suffix
+    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
+""",
+)
+
+
+@functools.lru_cache(None)
+def mm_configs():
+    import triton
+
+    # List of dictionaries to store the kernel configs. Configs whose cond
+    # evaluates to true will be utilised on the target platform.
+    mm_triton_configs = [
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 2,
+            "num_warps": 4,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 3,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 4,
+            "num_warps": 16,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32},
+            "num_stages": 4,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32},
+            "num_stages": 4,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32},
+            "num_stages": 1,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64},
+            "num_stages": 1,
+            "num_warps": 8,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128},
+            "num_stages": 1,
+            "num_warps": 8,
+            "cond": torch.version.hip is None,
+        },
+        {
+            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16},
+            "num_stages": 2,
+            "num_warps": 4,
+            "cond": True,
+        },
+        {
+            "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16},
+            "num_stages": 1,
+            "num_warps": 2,
+            "cond": True,
+        },
+    ]
+
+    # Keep only the configs whose cond evaluates to true.
+    # On ROCm, convert num_stages to 1, as pipelining provides no benefit there.
+    if torch.version.hip:
+        filtered_configs = [
+            triton.Config(c["config"], num_stages=1, num_warps=c["num_warps"])
+            for c in mm_triton_configs
+            if c["cond"]
+        ]
+    else:
+        filtered_configs = [
+            triton.Config(
+                c["config"], num_stages=c["num_stages"], num_warps=c["num_warps"]
+            )
+            for c in mm_triton_configs
+            if c["cond"]
+        ]
+
+    return filtered_configs
+
+
+def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
+    """
+    Computes mm(mat1, mat2) + mm(mat3, mat4)
+    """
+    m1, n1, k1, layout1, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
+    m2, n2, _, layout2, mat3, mat4 = mm_args(mat3, mat4, layout=layout)
+    # The optimization is optional, because we can always just not do the fusion.
+    if (
+        m1 * n1 == 0
+        or m2 * n2 == 0
+        or not V.graph.sizevars.statically_known_list_equals(
+            mat1.get_size(), mat3.get_size()
+        )
+        or not V.graph.sizevars.statically_known_list_equals(
+            mat2.get_size(), mat4.get_size()
+        )
+    ):
+        # TODO(jansel): support different K values when this is fixed:
+        # https://github.com/openai/triton/issues/967
+        return lowerings[aten.add](
+            lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4)
+        )
+
+    assert layout1 == layout2
+    # options to tune from
+    choices = (
+        [aten_mm_plus_mm.bind((mat1, mat2, mat3, mat4), layout1)]
+        if use_aten_gemm_kernels()
+        else []
+    )
+    if use_triton_template(layout1):
+        for config
in mm_configs(): + # see https://github.com/openai/triton/issues/1298 + # BLOCK_K = K causes llvm error + if V.graph.sizevars.statically_known_lt(config.kwargs["BLOCK_K"], k1): + mm_plus_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2, mat3, mat4), + layout=layout1, + **mm_options(config, m1, n1, k1, layout1), + ) + + return autotune_select_algorithm( + "mm_plus_mm", choices, [mat1, mat2, mat3, mat4], layout1 + ) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py new file mode 100644 index 0000000000000000000000000000000000000000..2f0d020716a199c802d07a19e906e20abb7e04ef --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py @@ -0,0 +1,311 @@ +import logging +from typing import Any, Dict, List, Optional, Tuple + +import sympy + +import torch + +from .. import config as inductor_config +from ..ir import ChoiceCaller, Layout, StorageBox, TensorBox +from ..lowering import add_layout_constraint, constrain_to_fx_strides, register_lowering +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + NoValidChoicesError, + realize_inputs, + TritonTemplate, +) +from ..utils import use_aten_gemm_kernels, use_triton_template +from .mm import _is_static_problem # TODO(yangsiyu) move to mm_common +from .mm_common import mm_args, mm_grid, scaled_mm_configs + + +log = logging.getLogger(__name__) +aten = torch.ops.aten + + +scaled_mm_template = TritonTemplate( + name="scaled_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
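+        # With USE_FAST_ACCUM, the running tile is passed to tl.dot as its
+        # accumulator argument, which (on hardware that supports it) keeps the
+        # accumulation inside the FP8 MMA; the else branch instead performs a
+        # separate fp32 add after each tl.dot.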
+ if B_PROLOGUE_CAST_TYPE is not None: + b = b.to(B_PROLOGUE_CAST_TYPE) + if USE_FAST_ACCUM: + acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE) + else: + acc += tl.dot(a, b, out_dtype=ACC_TYPE) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + if SCALING_ROWWISE: + inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M) + inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N) + inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :] + acc *= inv_scale_row + else: + # for tensor-wise scaling, the scales are scalars + inv_a_scale = tl.load(A_inverse_scale) + inv_b_scale = tl.load(B_inverse_scale) + inv_scale = inv_a_scale * inv_b_scale + acc *= inv_scale + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +# Inductor does not allow optional tensor input arguments currently (pass None as an +# input node to template choices), but since for _scaled_mm there is only one such arg +# (bias), work around by having a second template when bias is provided. +scaled_mm_bias_template = TritonTemplate( + name="scaled_mm_bias", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale", "bias_ptr")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
+        if B_PROLOGUE_CAST_TYPE is not None:
+            b = b.to(B_PROLOGUE_CAST_TYPE)
+        if USE_FAST_ACCUM:
+            acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE)
+        else:
+            acc += tl.dot(a, b, out_dtype=ACC_TYPE)
+        A += BLOCK_K * stride_ak
+        B += BLOCK_K * stride_bk
+
+    if SCALING_ROWWISE:
+        inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M)
+        inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N)
+        inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :]
+        acc *= inv_scale_row
+    else:
+        # for tensor-wise scaling, the scales are scalars
+        inv_a_scale = tl.load(A_inverse_scale)
+        inv_b_scale = tl.load(B_inverse_scale)
+        inv_scale = inv_a_scale * inv_b_scale
+        acc *= inv_scale
+
+    # rematerialize rm and rn to save registers
+    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    # bias
+    bias = tl.load(bias_ptr + rn, mask=rn < N)
+    acc += bias
+
+    idx_m = rm[:, None]
+    idx_n = rn[None, :]
+    mask = (idx_m < M) & (idx_n < N)
+
+    # inductor generates a suffix
+    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
+""",
+)
+
+
+aten__fp8_mm = ExternKernelChoice(torch._scaled_mm, "at::_scaled_mm")
+
+
+def are_compatible_scales(size_a: List[int], size_b: List[int]) -> bool:
+    # Same-sized scales are compatible
+    if len(size_a) == len(size_b):
+        return True
+
+    # Both need to be scalars or len(1) tensors
+    if len(size_a) <= 1 and len(size_b) <= 1:
+        return True
+
+    return False
+
+
+def scaled_mm_options(  # type: ignore[no-untyped-def]
+    config,  # triton.Config
+    sym_m: sympy.core.numbers.Integer,
+    sym_n: sympy.core.numbers.Integer,
+    sym_k: sympy.core.numbers.Integer,
+    layout: Layout,
+    scale_a: StorageBox,
+    scale_b: StorageBox,
+    use_fast_accum: bool,
+    b_prologue_cast_type: Optional[str] = None,
+) -> Dict[str, Any]:
+    even_k_symbolic = (
+        sympy.gcd(sym_k, config.kwargs["BLOCK_K"]) == config.kwargs["BLOCK_K"]
+    )
+
+    size_a, size_b = scale_a.get_size(), scale_b.get_size()
+    assert are_compatible_scales(size_a, size_b), (
+        "Expect scale_a and scale_b to be either both scalars (including single-element tensors) "
+        f"or 1-dimensional tensors with the same size. Got scale_a: {len(size_a)} and scale_b: {len(size_b)}."
+    )
+    return dict(
+        GROUP_M=8,
+        EVEN_K=even_k_symbolic,
+        ACC_TYPE="tl.float32",
+        B_PROLOGUE_CAST_TYPE=b_prologue_cast_type,
+        USE_FAST_ACCUM=use_fast_accum,
+        num_stages=config.num_stages,
+        num_warps=config.num_warps,
+        # row-wise scaling when the scales are 2-D tensors; tensor-wise when they are scalars
+        SCALING_ROWWISE=len(scale_a.get_size()) == 2,
+        **config.kwargs,
+    )
+
+
+add_layout_constraint(aten._scaled_mm.default, constrain_to_fx_strides)
+
+
+@register_lowering(aten._scaled_mm.default, type_promotion_kind=None)  # type: ignore[misc]
+def tuned_scaled_mm(
+    mat_a: TensorBox,
+    mat_b: TensorBox,
+    scale_a: TensorBox,
+    scale_b: TensorBox,
+    bias: Optional[TensorBox] = None,
+    scale_result: Optional[TensorBox] = None,
+    out_dtype: Optional[torch.dtype] = None,
+    use_fast_accum: bool = False,
+    layout: Optional[Layout] = None,
+) -> TensorBox:
+    m, n, k, layout, mat_a, mat_b = mm_args(
+        mat_a, mat_b, layout=layout, out_dtype=out_dtype
+    )
+    scale_a, scale_b = realize_inputs(scale_a, scale_b)
+
+    input_nodes: Tuple[Any, ...]
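+    # Hypothetical eager-mode sketch of the call this lowers (tensor-wise
+    # scaling; variable names are illustrative, not from this file):
+    #   a8 = a.to(torch.float8_e4m3fn)
+    #   b8 = b.t().contiguous().t().to(torch.float8_e4m3fn)  # column-major mat2
+    #   out = torch._scaled_mm(a8, b8, scale_a, scale_b, out_dtype=torch.bfloat16)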
+ # workaround for Inductor not supporting optional tensor input arguments + if bias is None: + input_nodes = (mat_a, mat_b, scale_a, scale_b) + triton_template = scaled_mm_template + else: + bias = realize_inputs(bias) + input_nodes = (mat_a, mat_b, scale_a, scale_b, bias) + triton_template = scaled_mm_bias_template + + aten_choice = aten__fp8_mm.bind( + input_nodes, layout, out_dtype=out_dtype, use_fast_accum=use_fast_accum + ) + + choices: List[ChoiceCaller] = [] + if use_aten_gemm_kernels(): + choices.append(aten_choice) + + static_shape, is_nonzero = _is_static_problem([mat_a, mat_b], layout) + if is_nonzero and use_triton_template(layout, enable_float8=True): + for config in scaled_mm_configs(m, n, k): + if k == 16 and config.kwargs["BLOCK_M"] >= 64: + continue # Triton crashes in this case + kwargs = scaled_mm_options( + config, m, n, k, layout, scale_a, scale_b, use_fast_accum + ) + # possibly appends a TritonTemplateCaller to choices + triton_template.maybe_append_choice( + choices, + input_nodes=input_nodes, + layout=layout, + **kwargs, + ) + + if ( + len(choices) == 0 + and not use_aten_gemm_kernels() + and inductor_config.autotune_fallback_to_aten + ): + log.warning("No choices for scaled_mm, using ATen backend as fallback") + return aten_choice.output_node() + + try: + return autotune_select_algorithm("scaled_mm", choices, input_nodes, layout) + except NoValidChoicesError: + if not inductor_config.autotune_fallback_to_aten: + raise + log.warning( + "All choices for scaled_mm were invalid, using ATen backend as fallback" + ) + return aten_choice.output_node() diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..674da97c165546acf8397f57e52930e60a7c195d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py @@ -0,0 +1,87 @@ +# mypy: allow-untyped-defs +import logging +from typing import List, TYPE_CHECKING + +from ..select_algorithm import autotune_select_algorithm, TritonTemplate +from .mm_common import mm_args, mm_configs, mm_grid, mm_options + + +if TYPE_CHECKING: + from ..ir import ChoiceCaller + +log = logging.getLogger(__name__) + +uint4x2_mixed_mm_template = TritonTemplate( + name="uint4x2_mixed_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None]//2 * stride_bk + rbn[None, :] * stride_bn) + b_shifts = 4*(rk%2) + b_subs = 8*(1-(rk%2)) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): 
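+        # Each int8 element of B packs two uint4 values: b_shifts selects the
+        # low nibble for even rk and the high nibble for odd rk, and the "- 8"
+        # below recenters the unsigned nibble from [0, 15] to signed [-8, 7].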
+ if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) + b = ((b >> b_shifts[:, None]) & 0xF) - 8 + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K//2 * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +def tuned_uint4x2_mixed_mm(mat1, mat2, mat2_mm_shape, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None, use_4x2_dim=True) + choices: List[ChoiceCaller] = [] + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + for config in mm_configs(m, n, k): + uint4x2_mixed_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + return autotune_select_algorithm("uint4x2_mixed_mm", choices, [mat1, mat2], layout) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__init__.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2241377aeefe75227902da2c6c01c6589bd4cd39 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff2cb55adc68cf76d5cb07ea8702ecbfcf678286 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb83530d1e742b489d74b58814e69bf998e96ace Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf012aa405a51756a414e8b2610a9260738ee55a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..a2befe423bd41386df420a2bdb5247e441a26f2b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/halide_helpers.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/halide_helpers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03fe9fdf15b578e19d1e209739ec0510de2ec577 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/halide_helpers.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/hints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/hints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d500c5f757c4e72a55a12338de26e6cc4fec1acb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/hints.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/runtime_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/runtime_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92697cbc01d7696d9b3e8cddfef6d4ed84456118 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/runtime_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_helpers.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_helpers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f9eaa9ee60447b080aa2913381df9c54397a06e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_helpers.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_heuristics.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_heuristics.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83d455ac3591b74faa7e7e5114f9e18d4b56cd9f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/triton_heuristics.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/autotune_cache.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/autotune_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..65dfc73d63d72d89f736ada79d7b9c7179e9dfb7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/autotune_cache.py @@ -0,0 +1,237 @@ +from __future__ import annotations + +import dataclasses +import hashlib +import logging +import os +import os.path +from typing import Dict, List, Optional, Tuple +from typing_extensions import override + +import torch +from torch.utils._triton import has_triton_package + +from ..remote_cache import ( + JsonDataTy, + RemoteCache, + RemoteCacheBackend, + RemoteCacheJsonSerde, +) + + +if has_triton_package(): + from triton import Config + +log = logging.getLogger(__name__) + + +_InductorMetaTy = Dict[str, object] + + +@dataclasses.dataclass +class AutotuneCache: + configs_hash: str + 
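+    # configs_hash identifies the candidate config list; mismatched hashes
+    # invalidate cached entries (see _load_cached_autotuning below)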
filename: str
+    local_cache: Optional[Tuple[RemoteCache[JsonDataTy], str]] = None
+    remote_cache: Optional[Tuple[RemoteCache[JsonDataTy], str]] = None
+
+    # Create an AutotuneCache. Returns None if none of the caches can be used.
+    @staticmethod
+    def create(
+        inductor_meta: _InductorMetaTy, filename: str, configs_hash: str
+    ) -> Optional[AutotuneCache]:
+        cache = AutotuneCache(configs_hash, filename)
+        cache._setup_local_cache(inductor_meta, filename)
+        cache._setup_remote_autotune_cache(inductor_meta, filename)
+        if cache.local_cache or cache.remote_cache:
+            return cache
+        else:
+            return None
+
+    # Read the best config options from the most local cache and return it.
+    def _read(self, inductor_meta: _InductorMetaTy) -> Optional[Dict[str, JsonDataTy]]:
+        if local_cache := self.local_cache:
+            cache, key = local_cache
+            if best_config := cache.get(key):
+                if isinstance(best_config, dict):
+                    return best_config
+
+        if remote_cache := self.remote_cache:
+            cache, key = remote_cache
+            if best_config := cache.get(key):
+                if isinstance(best_config, dict):
+                    return best_config
+
+        return None
+
+    # Read the best config options from the most local cache and figure out
+    # which `configs` represents that option.
+    def read_best(
+        self, inductor_meta: _InductorMetaTy, configs: List[Config]
+    ) -> Optional[Config]:
+        if best := self._read(inductor_meta):
+            return _load_cached_autotuning(
+                best, self.configs_hash, configs, inductor_meta
+            )
+        return None
+
+    # Set up local filesystem caching information
+    def _setup_local_cache(self, inductor_meta: _InductorMetaTy, filename: str) -> None:
+        if not inductor_meta.get("autotune_local_cache", True):
+            return
+
+        cache_filename = os.path.splitext(filename)[0] + ".best_config"
+        local_cache = RemoteCache(_LocalAutotuneCacheBackend(), RemoteCacheJsonSerde())
+        self.local_cache = (local_cache, cache_filename)
+
+    # Set up remote caching information
+    def _setup_remote_autotune_cache(
+        self, inductor_meta: _InductorMetaTy, filename: str
+    ) -> None:
+        if not _should_use_remote_autotune_cache(inductor_meta):
+            return
+
+        remote_cache = _create_cache(
+            inductor_meta,
+            self.configs_hash,
+            "FbRemoteAutotuneCache",
+            "RemoteAutotuneCache",
+            "autotune-best-config-v2",
+        )
+        if not remote_cache:
+            return
+
+        # we already sha256 hash the source contents
+        remote_cache_key = os.path.basename(filename)
+        self.remote_cache = (remote_cache, remote_cache_key)
+
+    # Save the config in the caches
+    def save(
+        self, config: Config, time_taken_ns: int, found_by_coordesc: bool = False
+    ) -> None:
+        data = {
+            **config.kwargs,
+            "num_warps": config.num_warps,
+            "num_stages": config.num_stages,
+            "configs_hash": self.configs_hash,
+            "found_by_coordesc": found_by_coordesc,
+            "time_taken_ms": time_taken_ns // 1000000,  # Convert from NS to MS
+        }
+
+        if local_cache := self.local_cache:
+            cache, key = local_cache
+            cache.put(key, data)
+
+            if log.isEnabledFor(logging.DEBUG):
+                type_str = "coordesc" if found_by_coordesc else "heuristic"
+                log.debug("Save %s tuning result to %s", type_str, key)
+
+        if remote_cache := self.remote_cache:
+            cache, key = remote_cache
+            cache.put(key, data)
+
+
+def _should_use_remote_autotune_cache(inductor_meta: Dict[str, object]) -> bool:
+    if (config := inductor_meta.get("autotune_remote_cache")) is not None:
+        return bool(config)
+    if not inductor_meta.get("is_fbcode"):
+        return False
+    if torch._utils_internal.is_fb_unit_test():
+        return False
+    if inductor_meta.get("is_hip"):
+        return False
+
+    try:
+        from torch._inductor.fb.remote_cache
import REMOTE_CACHE_VERSION + except ModuleNotFoundError: + return False + + return REMOTE_CACHE_VERSION >= torch._utils_internal.justknobs_getval_int( + "pytorch/remote_cache:autotune_memcache_version" + ) + + +def _load_cached_autotuning( + best_config: Dict[str, JsonDataTy], + configs_hash: str, + configs: List[Config], + inductor_meta: Dict[str, object], +) -> Optional[Config]: + if best_config is None: + return None + if best_config.pop("configs_hash", None) != configs_hash: + return None + + # Remove time taken for comparison + best_config.pop("time_taken_ms", None) + + if inductor_meta.get("coordinate_descent_tuning") and best_config.pop( + "found_by_coordesc", False + ): + num_warps = best_config.pop("num_warps") + num_stages = best_config.pop("num_stages") + triton_config = Config(best_config, num_warps=num_warps, num_stages=num_stages) + triton_config.found_by_coordesc = True + return triton_config + + matching_configs = [ + cfg + for cfg in configs + if all(val == best_config.get(key) for key, val in cfg.kwargs.items()) + and cfg.num_warps == best_config.get("num_warps") + and cfg.num_stages == best_config.get("num_stages") + ] + if len(matching_configs) != 1: + return None + + return matching_configs[0] + + +def _create_cache( + inductor_meta: Dict[str, object], + configs_hash: str, + fb_cache_cls: str, + oss_cache_cls: str, + salt: str, +) -> Optional[RemoteCache[JsonDataTy]]: + backend_hash = inductor_meta.get("backend_hash", None) + if backend_hash is None: + log.debug( + "backend_hash is not passed on the inductor_meta, unable to use autotune remote cache" + ) + return None + + assert isinstance(backend_hash, str) + + key = backend_hash + configs_hash + salt + key = hashlib.sha256(key.encode("utf-8")).hexdigest() + + try: + if inductor_meta.get("is_fbcode"): + import torch._inductor.fb.remote_cache + + cache_cls = getattr(torch._inductor.fb.remote_cache, fb_cache_cls) + return cache_cls(key) + else: + import torch._inductor.remote_cache + + cache_cls = getattr(torch._inductor.remote_cache, oss_cache_cls) + return cache_cls(key) + except Exception: + log.warning("Unable to create a remote cache", exc_info=True) + return None + + +class _LocalAutotuneCacheBackend(RemoteCacheBackend[bytes]): + @override + def get(self, key: str) -> Optional[bytes]: + try: + with open(key, "rb") as fd: + return fd.read() + except FileNotFoundError: + return None + + @override + def put(self, key: str, data: bytes) -> None: + with open(key, "wb") as fd: + fd.write(data) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.py new file mode 100644 index 0000000000000000000000000000000000000000..44c222c3f936eff7b1f2ddc5a8953c5dd1e82539 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.py @@ -0,0 +1,204 @@ +import time +from functools import cached_property, wraps +from itertools import chain +from statistics import median +from typing import Any, Callable, Dict, List, Tuple +from typing_extensions import Concatenate, ParamSpec, Self, TypeVar + +import torch +from torch._dynamo.utils import counters + + +logger = torch._logging.getArtifactLogger(__name__, "benchmarking") + + +MILLISECONDS_PER_SECOND = 1000 + +P = ParamSpec("P") +T = TypeVar("T") + + +def maybe_time( + fn: Callable[Concatenate[Any, P], T] +) -> Callable[Concatenate[Any, P], T]: + """Wrapper that logs the duration of `fn`, in milliseconds, along with a representation + of the function's 
args and kwargs, if logging is enabled. It is expected that `fn` is
+    a method of `Benchmarker` or one of its subclasses; typing limitations prevent us from
+    declaring this directly. If logging is disabled, this becomes a no-op.
+    """
+
+    # no-op if benchmarking-specific logging is disabled
+    if not torch._logging._internal.log_state.is_artifact_enabled("benchmarking"):
+        return fn
+
+    @wraps(fn)
+    def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        start_t = time.perf_counter()
+        result = fn(self, *args, **kwargs)
+        logger.debug(
+            "Call `benchmarking.%s.%s(*args=%r, **kwargs=%r)` took %f milliseconds.",
+            self.__class__.__name__,
+            fn.__name__,
+            args,
+            kwargs,
+            (time.perf_counter() - start_t) * MILLISECONDS_PER_SECOND,
+        )
+        return result
+
+    return wrapper
+
+
+def count(fn: Callable[Concatenate[Any, P], T]) -> Callable[Concatenate[Any, P], T]:
+    """Wrapper that increments relevant dynamo counters on `fn` call. It is expected that
+    `fn` is a method of `Benchmarker` or one of its subclasses; typing limitations prevent
+    us from declaring this directly. The counter incrementation follows the formula,
+
+    `counters["inductor"]["benchmarking.Foo.bar"] += 1`
+
+    where `Foo` is the class whose instance called the function, and `bar` is the function name.
+    """
+
+    @wraps(fn)
+    def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        counters["inductor"][
+            "benchmarking." + self.__class__.__name__ + "." + fn.__name__
+        ] += 1
+        return fn(self, *args, **kwargs)
+
+    return wrapper
+
+
+class Benchmarker:
+    def __init__(self: Self) -> None:
+        pass
+
+    @maybe_time
+    @count
+    def benchmark(
+        self: Self,
+        fn: Callable[..., Any],
+        fn_args: Tuple[Any],
+        fn_kwargs: Dict[str, Any],
+        **kwargs: Any,
+    ) -> float:
+        """Benchmark `fn(*fn_args, **fn_kwargs)` and return the runtime, in milliseconds (the
+        actual runtime calculation is dictated by the benchmarking implementation, but may be
+        one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
+        device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
+        `ValueError(...)` if we can't safely infer the device type of `fn`; for example,
+        if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
+        types are found.
+
+        Arguments:
+        - fn: The function to benchmark.
+        - fn_args: The function's arguments.
+        - fn_kwargs: The function's kwargs.
+
+        Keyword Arguments:
+        - **kwargs: The benchmarking implementation's kwargs.
+
+        Returns:
+        - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
+        """
+        inferred_device = None
+        for arg_or_kwarg in chain(fn_args, fn_kwargs.values()):
+            if not isinstance(arg_or_kwarg, torch.Tensor):
+                continue
+            if inferred_device is None:
+                inferred_device = arg_or_kwarg.device
+            elif arg_or_kwarg.device != inferred_device:
+                raise ValueError(
+                    "Can't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!"
+                )
+        if inferred_device is None:
+            raise ValueError(
+                "Can't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly."
# noqa: B950 + ) + _callable = lambda: fn(*fn_args, **fn_kwargs) # noqa: E731 + if inferred_device == torch.device("cpu"): + return self.benchmark_cpu(_callable, **kwargs) + # TODO(nmacchioni): For non-CPU functions we default to using the GPU-specific benchmarking + # implementation which was written specifically with CUDA devices in mind, we may want to + # explore alternate implementations for other device types. + return self.benchmark_gpu(_callable, **kwargs) + + @maybe_time + @count + def benchmark_cpu( + self: Self, _callable: Callable[[], Any], warmup: int = 20, rep: int = 100 + ) -> float: + """Benchmark the CPU callable, `_callable`, and return the median runtime, + in milliseconds. + + Arguments: + - _callable: The CPU callable to benchmark. + + Keyword Arguments: + - warmup: Optionally, the duration, in milliseconds, to run `_callable` + before benchmarking starts. + - rep: Optionally, the duration, in milliseconds, to run `_callable` + during benchmarking. + + Returns: + - The median runtime of `_callable`, in milliseconds. + """ + + def run_for(ms: int) -> List[float]: + timings = [] + run_start_t = time.perf_counter() + while True: + start_t = time.perf_counter() + _callable() + end_t = time.perf_counter() + timings.append((end_t - start_t) * MILLISECONDS_PER_SECOND) + if ((end_t - run_start_t) * MILLISECONDS_PER_SECOND) > ms: + break + return timings + + run_for(warmup) + return median(run_for(rep)) + + @count + def benchmark_gpu(self: Self, *args: Any, **kwargs: Any) -> float: + raise NotImplementedError + + +class TritonBenchmarker(Benchmarker): + @cached_property + @maybe_time + @count + def triton_do_bench(self: Self) -> Callable[..., Any]: + """Lazily import Triton's `do_bench`.""" + try: + from triton.testing import do_bench + except ImportError as e: + raise NotImplementedError("requires Triton") from e + return do_bench + + @maybe_time + @count + def benchmark_gpu(self: Self, _callable: Callable[[], Any], **kwargs: Any) -> float: + """Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds. + + Arguments: + - _callable: The GPU callable to benchmark. + + Keyword Arguments: + - quantiles: Optionally, a tuple of floats denoting the requested quantiles. + - return_mode: Optionally, the requested return mode. Currently, Triton's + `do_bench` supports min, max, mean, and median return modes. + - **kwargs: Additional kwargs passed to Triton's `do_bench`. + + Returns: + - The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified, + this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified, + this is the requested return mode. Otherwise, this is the median. 
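+        Note: if both `quantiles` and `return_mode` are passed, `quantiles`
+        takes precedence (see the dispatch below).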
+ """ + if "quantiles" in kwargs: + return self.triton_do_bench(_callable, **kwargs)[0] + elif "return_mode" in kwargs: + return self.triton_do_bench(_callable, **kwargs) + return self.triton_do_bench(_callable, **kwargs, return_mode="median") + + +benchmarker = TritonBenchmarker() diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..17788ab7920fb0c23c03ddb8caaa805a82a03785 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py @@ -0,0 +1,68 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import functools +import os +import sys +import warnings +from types import ModuleType +from typing import Any, Callable, Dict + + +def _reload_triton_kernel_in_subproc(reload_module, kernel_name): + return _module_to_triton_kernel(reload_module(), kernel_name) + + +def _module_to_triton_kernel(mod, kernel_name): + kernel = getattr(mod, kernel_name) + kernel._reload_in_subproc = functools.partial( + _reload_triton_kernel_in_subproc, + mod._reload_in_subproc, + kernel_name, + ) + return kernel + + +def _reload_python_module_in_subproc(key, path): + codecache = sys.modules.get("torch._inductor.codecache") + if codecache: + return codecache.PyCodeCache.load_by_key_path(key, path) + else: + return _reload_python_module(key, path) + + +def _reload_python_module(key, path): + with open(path) as f: + try: + code = compile(f.read(), path, "exec", dont_inherit=True) + except Exception as e: + raise RuntimeError( + f"Failed to import {path}\n{type(e).__name__}: {e}" + ) from None + mod = ModuleType(f"{__name__}.{key}") + mod.__file__ = path + mod.key = key # type: ignore[attr-defined] + exec(code, mod.__dict__, mod.__dict__) + sys.modules[mod.__name__] = mod + return mod + + +@functools.lru_cache(None) +def _set_triton_ptxas_path() -> None: + if os.environ.get("TRITON_PTXAS_PATH") is not None: + return + ptxas_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "bin", "ptxas") + ) + if not os.path.exists(ptxas_path): + return + if os.path.isfile(ptxas_path) and os.access(ptxas_path, os.X_OK): + os.environ["TRITON_PTXAS_PATH"] = ptxas_path + else: + warnings.warn(f"{ptxas_path} exists but is not an executable") + + +def _worker_compile_triton(load_kernel: Callable[[], Any], extra_env: Dict[str, str]): + _set_triton_ptxas_path() + os.environ.update(extra_env) + load_kernel().precompile(warm_cache_only=True) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/coordinate_descent_tuner.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/coordinate_descent_tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..4c2af613a04cac07258eddb6abd2248c89936ed7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -0,0 +1,304 @@ +# mypy: allow-untyped-defs +import copy +import itertools +import logging +from typing import Callable, Optional + +from .hints import TRITON_MAX_BLOCK +from .runtime_utils import red_text, triton_config_to_hashable + + +try: + import triton +except ImportError: + triton = None + +log = logging.getLogger(__name__) + + +def get_field(config, name): + if name == "num_warps": + return config.num_warps + elif name == "num_stages": + return config.num_stages + else: + return config.kwargs.get(name, None) + + +def set_field(config, name, 
value):
+    if name == "num_warps":
+        config.num_warps = value
+    elif name == "num_stages":
+        config.num_stages = value
+    else:
+        config.kwargs[name] = value
+
+
+class CoordescTuner:
+    """
+    The coordinate descent tuner. Tune one field/coordinate at a time.
+
+    TODO: will it be necessary to tune multiple fields simultaneously?
+
+    TODO: what if both increasing and decreasing a field can improve perf,
+    i.e., there are multiple local optima?
+    """
+
+    def __init__(
+        self, is_mm=False, name="unknown", size_hints=None, inductor_meta=None
+    ):
+        self.is_mm = is_mm  # we will tune num_stages for mm
+        self.cached_benchmark_results = {}
+        self.name = name
+        self.size_hints = size_hints
+        self.inductor_meta = inductor_meta or {}
+
+    def prefix_to_size_hint(self, prefix: str) -> Optional[int]:
+        size_hint_idx = {"X": 0, "Y": 1, "Z": 2, "R": -1}[prefix]
+
+        have_size_hint = (
+            self.size_hints is not None
+            and len(self.size_hints) > 0
+            and len(self.size_hints) > size_hint_idx
+        )
+        return self.size_hints[size_hint_idx] if have_size_hint else None
+
+    def get_config_max(self, prefix: str) -> int:
+        max_block = TRITON_MAX_BLOCK[prefix]
+        size_hint = self.prefix_to_size_hint(prefix)
+        return min(max_block, size_hint) if size_hint is not None else max_block
+
+    def get_warpsmax(self):
+        # Currently, CUDA has a maximum of 1024 threads, so 32 is the max
+        # number of warps.
+        return 1024 // 32
+
+    def cache_benchmark_result(self, config, timing):
+        self.cached_benchmark_results[triton_config_to_hashable(config)] = timing
+
+    def lookup_in_cache(self, config):
+        return self.cached_benchmark_results.get(triton_config_to_hashable(config))
+
+    def call_func(self, func, config):
+        found = self.lookup_in_cache(config)
+        if found is not None:
+            log.debug("  CACHED")
+            return found
+        timing = func(config)
+        self.cache_benchmark_result(config, timing)
+        return timing
+
+    @property
+    def tunable_fields(self):
+        out = [
+            "XBLOCK",
+            "YBLOCK",
+            "ZBLOCK",
+            # NOTE: we should not tune RBLOCK for persistent reduction.
+            # We rely on the fact that persistent reduction's triton.Config
+            # does not have the RBLOCK field to guarantee that.
+            "RBLOCK",
+            # the following 3 are for mm
+            "BLOCK_M",
+            "BLOCK_N",
+            "BLOCK_K",
+            "num_warps",
+        ]
+        if self.is_mm:
+            out.append("num_stages")
+
+        return out
+
+    def value_too_large(self, name: str, val: int) -> bool:
+        if name in {"XBLOCK", "YBLOCK", "ZBLOCK", "RBLOCK"}:
+            return val > self.get_config_max(name[0])
+        if name == "num_warps":
+            return val > self.get_warpsmax()
+
+        return False
+
+    def get_neighbour_values(self, name, orig_val, radius=1, include_self=False):
+        """
+        Get neighbour values in 'radius' steps. The original value is not
+        returned as its own neighbour.
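+        For example, with name="XBLOCK", orig_val=8 and radius=1 the
+        neighbours are [16, 4]: block-size fields are doubled/halved per
+        step, while num_stages steps by +/-1 (subject to value_too_large).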
+ """ + assert radius >= 1 + + def update(cur_val, inc=True): + if name == "num_stages": + if inc: + return cur_val + 1 + else: + return cur_val - 1 + else: + if inc: + return cur_val * 2 + else: + return cur_val // 2 + + out = [] + # increment loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, True) + if self.value_too_large(name, cur_val): + break + out.append(cur_val) + + # decrement loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, False) + if cur_val <= 0: + break + out.append(cur_val) + + if include_self: + out.append(orig_val) + return out + + @staticmethod + def has_improvement(baseline, test): + threshold = 0.001 # 0.1% + return test is not None and test < baseline * (1 - threshold) + + def check_all_tuning_directions( + self, + func: Callable[["triton.Config"], float], + best_config, + best_timing, + ): + """ + Check all directions. We only do this once the regular coordinate + descent tuning find no better choices any more. + We only have a few tunable fields, so this should be fine. + """ + candidate_values_list = [] + effective_fields = [] + for field in self.tunable_fields: + old_value = get_field(best_config, field) + if old_value is None: + continue + candidate_values = self.get_neighbour_values( + field, + old_value, + radius=self.inductor_meta.get("coordinate_descent_search_radius", 1), + include_self=True, + ) + candidate_values_list.append(candidate_values) + effective_fields.append(field) + + choices = itertools.product(*candidate_values_list) + improved = False + for choice in choices: + assert len(choice) == len(effective_fields) + candidate_config = copy.deepcopy(best_config) + for new_val, field in zip(choice, effective_fields): + set_field(candidate_config, field, new_val) + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config = candidate_config + best_timing = candidate_timing + + return improved, best_config, best_timing + + def compare_config(self, func, candidate_config, best_config, best_timing): + """ + Check if candidate_config is better than best_config. + + Return a touple of (compare_result, candidate_timing). + compare_result is true iff candidate_config is better. + """ + log.debug("Try config %s", candidate_config) + try: + candidate_timing = self.call_func(func, candidate_config) + except Exception as e: + log.debug("Got exception %s", e) + return False, float("inf") + + if self.has_improvement(best_timing, candidate_timing): + log.debug( + "Tune from %s %f -> %s %f", + best_config, + best_timing, + candidate_config, + candidate_timing, + ) + + return True, candidate_timing + return False, candidate_timing + + def autotune( + self, + func: Callable[["triton.Config"], float], + baseline_config: "triton.Config", + baseline_timing: Optional[float] = None, + ) -> "triton.Config": + if baseline_timing is None: + baseline_timing = self.call_func(func, baseline_config) + + log.debug("= Do coordinate descent tuning for %s =", self.name) + log.debug( + "Baseline Config %s, baseline timing %f", baseline_config, baseline_timing + ) + improved = True + best_config = baseline_config + best_timing = baseline_timing + tunable_fields = self.tunable_fields + + while improved: + improved = False + + for name in tunable_fields: + cur_val = get_field(best_config, name) + # some kernel don't have RBLOCK/YBLOCK/ZBLOCK. 
So cur_val may be None + if cur_val is None: + continue + + # It's possible that candidate_values is empty. + # E.g., if XBLOCK is 1 initially and size_hint for x is also 1. + # We would not try either larger or smaller XBLOCK in this case. + candidate_values = self.get_neighbour_values(name, cur_val) + + for next_val in candidate_values: + candidate_config = copy.deepcopy(best_config) + set_field(candidate_config, name, next_val) + + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config, best_timing = candidate_config, candidate_timing + + if not improved and self.inductor_meta.get( + "coordinate_descent_check_all_directions" + ): + old_best_timing = best_timing + improved, best_config, best_timing = self.check_all_tuning_directions( + func, best_config, best_timing + ) + + if improved: + msg = red_text( + "Coordinate descend tuning found improvement of %.3fx by looking in all directions." + ) + log.debug( + msg, + old_best_timing / best_timing, + ) + + log.debug( + "Improve from %s %f -> %s %f, %.3fx", + baseline_config, + baseline_timing, + best_config, + best_timing, + baseline_timing / best_timing, + ) + + return best_config diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/halide_helpers.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/halide_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..db813b0c051a19706937ff05e838aea4f45619f3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/halide_helpers.py @@ -0,0 +1,118 @@ +# mypy: allow-untyped-defs +try: + import halide as hl # type: ignore[import-untyped, import-not-found] +except ImportError: + hl = None + +PHILOX_N_ROUNDS_DEFAULT = 10 # Default number of rounds for philox + +if hl is not None: + PHILOX_KEY_A_U32 = hl.u32(0x9E3779B9) + PHILOX_KEY_B_U32 = hl.u32(0xBB67AE85) + PHILOX_ROUND_A_U32 = hl.u32(0xD2511F53) + PHILOX_ROUND_B_U32 = hl.u32(0xCD9E8D57) +else: + PHILOX_KEY_A_U32 = None + PHILOX_KEY_B_U32 = None + PHILOX_ROUND_A_U32 = None + PHILOX_ROUND_B_U32 = None + + +def _pair_uniform_to_normal(u1, u2): + """Box-Muller transform""" + u1 = hl.max(hl.f32(1.0e-7), u1) + th = hl.f32(6.283185307179586) * u2 + r = hl.sqrt(hl.f32(-2.0) * hl.log(u1)) + return r * hl.cos(th), r * hl.sin(th) + + +def _uint_to_uniform_float(x): + """ + Numerically stable function to convert a random uint into a random float uniformly sampled in [0, 1). + """ + + # TODO: + # conditions can be simplified + # scale is ((2**23 - 1) / 2**23) * 2**(N_BITS - 1) + # https://github.com/triton-lang/triton/blob/e4a0d93ff1a367c7d4eeebbcd7079ed267e6b06f/python/triton/language/random.py#L116-L132. 
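+    # In short: fold negative int32 values onto the non-negative range,
+    # then scale by ~2**-31 so the result lands in [0, 1).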
+ assert x.type() == hl.UInt(32) or x.type() == hl.Int(32) + x = hl.cast(hl.Int(32), x) + scale = hl.f64(4.6566127342e-10) + x = hl.select(x < 0, -x - 1, x) + return x * scale + + +def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds): + def umulhi(a, b): + a = hl.cast(hl.UInt(64), a) + b = hl.cast(hl.UInt(64), b) + return hl.cast(hl.UInt(32), ((a * b) >> 32) & hl.u64(0xFFFFFFFF)) + + for _ in range(n_rounds): + _c0, _c2 = c0, c2 + + c0 = umulhi(PHILOX_ROUND_B_U32, _c2) ^ c1 ^ k0 + c2 = umulhi(PHILOX_ROUND_A_U32, _c0) ^ c3 ^ k1 + c1 = PHILOX_ROUND_B_U32 * _c2 + c3 = PHILOX_ROUND_A_U32 * _c0 + # raise key + k0 = k0 + PHILOX_KEY_A_U32 + k1 = k1 + PHILOX_KEY_B_U32 + + return c0, c1, c2, c3 + + +def halide_philox(seed, c0, c1, c2, c3, n_rounds): + seed = hl.cast(hl.UInt(64), seed) + + assert c0.type().bits() == 32 + + seed_hi = hl.cast(hl.UInt(32), (seed >> 32) & hl.u64(0xFFFFFFFF)) + seed_lo = hl.cast(hl.UInt(32), seed & hl.u64(0xFFFFFFFF)) + + return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds) + + +def randint4x(seed, offset, n_rounds): + offset = hl.cast(hl.UInt(32), offset) + _0 = hl.u32(0) + return halide_philox(seed, offset, _0, _0, _0, n_rounds) + + +def rand4x(seed, offset, n_rounds=PHILOX_N_ROUNDS_DEFAULT): + i1, i2, i3, i4 = randint4x(seed, offset, n_rounds) + u1 = _uint_to_uniform_float(i1) + u2 = _uint_to_uniform_float(i2) + u3 = _uint_to_uniform_float(i3) + u4 = _uint_to_uniform_float(i4) + return u1, u2, u3, u4 + + +def randint(seed, offset, n_rounds=PHILOX_N_ROUNDS_DEFAULT): + ret, _, _, _ = randint4x(seed, offset, n_rounds) + return ret + + +def rand(seed, offset, n_rounds=PHILOX_N_ROUNDS_DEFAULT): + source = randint(seed, offset, n_rounds) + return _uint_to_uniform_float(source) + + +def randn(seed, offset): + i1, i2, _, _ = randint4x(seed, offset, PHILOX_N_ROUNDS_DEFAULT) + u1 = _uint_to_uniform_float(i1) + u2 = _uint_to_uniform_float(i2) + n1, _ = _pair_uniform_to_normal(u1, u2) + return n1 + + +def randint64(seed, offset, low, high): + r0, r1, r2, r3 = randint4x(seed, offset, PHILOX_N_ROUNDS_DEFAULT) + r0 = hl.cast(hl.UInt(64), r0) + r1 = hl.cast(hl.UInt(64), r1) + + result = r0 | (r1 << 32) + size = high - low + result = result % hl.cast(hl.UInt(64), size) + result = hl.cast(hl.Int(64), result) + low + return result diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/hints.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/hints.py new file mode 100644 index 0000000000000000000000000000000000000000..90f9e5a1cc5966bf0416c947cf63ea29a281113d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/hints.py @@ -0,0 +1,179 @@ +# mypy: allow-untyped-defs +import collections +import typing +from dataclasses import fields +from enum import auto, Enum +from typing import Dict, List, Optional, Union + + +# NOTE: if these fail asserts submit a PR to increase them +TRITON_MAX_BLOCK = { + "X": 2048, + "Y": 1024, + "Z": 1024, + "R": 4096 * 16, # * 16 is multi-kernel only +} + + +class ReductionHint(Enum): + INNER = 0 + OUTER = 1 + OUTER_TINY = 2 + DEFAULT = 3 + + +class TileHint(Enum): + SQUARE = 0 + DEFAULT = 1 + + +# Attempt to import AttrsDescriptor from Triton +try: + from triton.compiler.compiler import AttrsDescriptor + + attrs_descriptor_available = True + # Determine if 'ids_of_folded_args' is a valid field for AttrsDescriptor + attr_desc_fields = {f.name for f in fields(AttrsDescriptor)} + ids_of_folded_args_available = "ids_of_folded_args" in attr_desc_fields + divisible_by_8_available = "divisible_by_8" in 
attr_desc_fields
+except ImportError:
+    attrs_descriptor_available = False
+
+# Define `instance_descriptor` function with clear conditional handling
+if attrs_descriptor_available:
+
+    def instance_descriptor(
+        divisible_by_16=None,
+        equal_to_1=None,
+        ids_of_folded_args=None,
+        divisible_by_8=None,
+    ):
+        # Prepare the arguments for AttrsDescriptor
+        kwargs = {
+            "divisible_by_16": divisible_by_16,
+            "equal_to_1": equal_to_1,
+        }
+
+        # Conditionally add 'ids_of_folded_args' if it's available in AttrsDescriptor
+        if ids_of_folded_args_available:
+            kwargs["ids_of_folded_args"] = ids_of_folded_args
+        if divisible_by_8_available:
+            kwargs["divisible_by_8"] = divisible_by_8
+
+        # Instantiate AttrsDescriptor with the prepared arguments
+        return AttrsDescriptor(**kwargs)
+
+else:
+    # Define a namedtuple as a fallback when AttrsDescriptor is not available
+    instance_descriptor = collections.namedtuple(  # type: ignore[no-redef]
+        "instance_descriptor",
+        ["divisible_by_16", "equal_to_1", "ids_of_folded_args", "divisible_by_8"],
+        defaults=[(), (), (), ()],
+    )
+
+
+_NUM_THREADS_PER_WARP = 32
+
+
+class HeuristicType(Enum):
+    PERSISTENT_REDUCTION = auto()
+    POINTWISE = auto()
+    REDUCTION = auto()
+    SPLIT_SCAN = auto()
+    TEMPLATE = auto()
+    USER_AUTOTUNE = auto()
+
+
+class AutotuneHint(Enum):
+    ELEMENTS_PER_WARP_32 = 0
+
+    # Triton codegen tries to codegen a set of AutotuneHints.
+    # Enum.__repr__ looks like "<AutotuneHint.ELEMENTS_PER_WARP_32: 0>",
+    # which isn't valid python.
+    # Enum.__str__ will just return "AutotuneHint.ELEMENTS_PER_WARP_32".
+    __repr__ = Enum.__str__
+
+
+class DeviceProperties(typing.NamedTuple):
+    """Copy device properties into a data structure not requiring torch to be imported"""
+
+    type: str  # type: ignore[assignment]
+    index: int  # type: ignore[assignment]
+    cc: int
+    major: Optional[int] = None
+    regs_per_multiprocessor: Optional[int] = None
+    max_threads_per_multi_processor: Optional[int] = None
+    multi_processor_count: Optional[int] = None
+
+    @classmethod
+    def create(cls, device):
+        import torch
+        from torch._dynamo.device_interface import get_interface_for_device
+
+        device_type = device.type if torch.version.hip is None else "hip"
+        device_interface = get_interface_for_device(device)
+        if device_type == "cuda":
+            props = device_interface.get_device_properties(device)
+            return cls(
+                type=device_type,
+                index=device.index,
+                cc=device_interface.get_compute_capability(device),
+                major=props.major,
+                regs_per_multiprocessor=props.regs_per_multiprocessor,
+                max_threads_per_multi_processor=props.max_threads_per_multi_processor,
+                multi_processor_count=props.multi_processor_count,
+            )
+        return cls(
+            type=device_type,
+            index=device.index,
+            cc=device_interface.get_compute_capability(device),
+        )
+
+
+class HalideInputSpec(typing.NamedTuple):
+    ctype: str
+    name: str
+    shape: Optional[List[str]] = None
+    stride: Optional[List[str]] = None
+    offset: Optional[str] = None
+    alias_of: Optional[str] = None
+
+    def bindings_type(self):
+        if self.ctype in ("half*", "bfloat16*"):
+            return "uint16_t*"  # half not defined
+        return self.ctype
+
+    def halide_type(self):
+        if self.ctype == "half*":
+            return "halide_type_t(halide_type_float, 16)"  # half not defined
+        if self.ctype == "bfloat16*":
+            return "halide_type_t(halide_type_bfloat, 16)"  # half not defined
+        return f"halide_type_of<{self.ctype.replace('*', '')}>()"
+
+    def is_scalar(self):
+        return self.shape is None
+
+    def is_buffer(self):
+        return self.shape is not None
+
+
+class HalideMeta(typing.NamedTuple):
+    argtypes: List[HalideInputSpec]
+    target: str
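+    # target is the Halide target string, passed to the generator as "target=..." (see args() below)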
+    scheduler: Optional[str] = None
+    scheduler_flags: Optional[Dict[str, Union[int, str]]] = None
+    cuda_device: Optional[int] = None
+
+    def args(self):
+        """Command line args to pass to halide generator"""
+        args = [f"target={self.target}"]
+        if self.scheduler:
+            args.append(f"autoscheduler={self.scheduler}")
+        if self.scheduler_flags:
+            assert self.scheduler
+            for k, v in self.scheduler_flags.items():
+                args.append(f"autoscheduler.{k}={v}")
+        return args
+
+    def is_cuda(self):
+        return self.cuda_device is not None
diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/runtime_utils.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/runtime_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..446dbc71c61d1d7f5f7892b1a0c748974d539caf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/runtime_utils.py
@@ -0,0 +1,154 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+import contextlib
+import functools
+import getpass
+import operator
+import os
+import re
+import tempfile
+
+import torch
+
+
+def conditional_product(*args):
+    return functools.reduce(operator.mul, [x for x in args if x])
+
+
+def ceildiv(numer: int, denom: int) -> int:
+    return -(numer // -denom)
+
+
+def is_power_of_2(n: int) -> bool:
+    """Returns whether n = 2 ** m for some integer m."""
+    return n > 0 and n & n - 1 == 0
+
+
+def next_power_of_2(n: int) -> int:
+    """Return the smallest power of 2 greater than or equal to n"""
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n |= n >> 32
+    n += 1
+    return n
+
+
+def get_num_bytes(*args: torch.Tensor, num_in_out_args: int = 0) -> int:
+    """
+    Return the total number of bytes the arguments of tensor type take.
+
+    For in/out args, tensor sizes are counted twice: once for reading and
+    once for writing.
+
+    The first num_in_out_args arguments are in/out tensors.
+    """
+    return sum(
+        arg.numel() * arg.element_size() * (1 + int(i < num_in_out_args))
+        for i, arg in enumerate(args)
+        if isinstance(arg, torch.Tensor)
+    )
+
+
+def triton_config_to_hashable(cfg):
+    """
+    Convert triton config to a tuple that can uniquely identify it. We can use
+    the return value as a dictionary key.
+    """
+    items = sorted(cfg.kwargs.items())
+    items.append(("num_warps", cfg.num_warps))
+    items.append(("num_stages", cfg.num_stages))
+    return tuple(items)
+
+
+def validate_triton_config(cfg):
+    # [Note: Triton pre_hook in inductor]
+    # pre-hook is a lambda function, which we don't attempt to serialize.
+    # right now, if a pre-hook is attached to the config, it will not be saved;
+    # and then it won't be used when the config is loaded from cache.
+    # So we assert: if we do get a pre_hook, it might get ignored after caching.
+    assert (
+        getattr(cfg, "pre_hook", None) is None
+    ), "triton configs with pre_hooks not supported"
+
+
+def create_bandwidth_info_str(ms, num_gb, gb_per_s, prefix="", suffix="", color=True):
+    info_str = f"{prefix}{ms:.3f}ms \t{num_gb:.3f} GB \t {gb_per_s:7.2f}GB/s{suffix}"
+    slow = ms > 0.012 and gb_per_s < 650
+    return red_text(info_str) if color and slow else info_str
+
+
+def get_max_y_grid():
+    return 65535
+
+
+def cache_dir() -> str:
+    cache_dir = os.environ.get("TORCHINDUCTOR_CACHE_DIR")
+    if cache_dir is None:
+        os.environ["TORCHINDUCTOR_CACHE_DIR"] = cache_dir = default_cache_dir()
+    os.makedirs(cache_dir, exist_ok=True)
+    return cache_dir
+
+
+def default_cache_dir():
+    sanitized_username = re.sub(r'[\\/:*?"<>|]', "_", getpass.getuser())
+    return os.path.join(
+        tempfile.gettempdir(),
+        "torchinductor_" + sanitized_username,
+    )
+
+
+try:
+    import colorama
+
+    HAS_COLORAMA = True
+except ModuleNotFoundError:
+    HAS_COLORAMA = False
+    colorama = None  # type: ignore[assignment]
+
+
+def _color_text(msg, color):
+    if not HAS_COLORAMA:
+        return msg
+
+    return getattr(colorama.Fore, color.upper()) + msg + colorama.Fore.RESET
+
+
+def green_text(msg):
+    return _color_text(msg, "green")
+
+
+def yellow_text(msg):
+    return _color_text(msg, "yellow")
+
+
+def red_text(msg):
+    return _color_text(msg, "red")
+
+
+def blue_text(msg):
+    return _color_text(msg, "blue")
+
+
+def get_first_attr(obj, *attrs):
+    """
+    Return the first available attribute or throw an exception if none is present.
+    """
+    for attr in attrs:
+        if hasattr(obj, attr):
+            return getattr(obj, attr)
+
+    raise AssertionError(f"{obj} does not have any of the attributes: {attrs}")
+
+
+try:
+    dynamo_timed = torch._dynamo.utils.dynamo_timed  # type: ignore[has-type]
+except AttributeError:  # Compile workers only have a mock version of torch
+
+    @contextlib.contextmanager
+    def dynamo_timed(key, phase_name=None, fwd_only=True):
+        yield
diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a8ff2ba2408d7fce523e8b46e4adf5d65c827ed
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py
@@ -0,0 +1,542 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import triton
+import triton.language as tl
+
+
+# In the latest triton, math functions were shuffled around into different modules:
+# https://github.com/openai/triton/pull/3172
+try:
+    from triton.language.extra import libdevice
+
+    libdevice = tl.extra.libdevice  # noqa: F811
+    math = tl.math
+except ImportError:
+    if hasattr(tl.extra, "cuda") and hasattr(tl.extra.cuda, "libdevice"):
+        libdevice = tl.extra.cuda.libdevice
+        math = tl.math
+    elif hasattr(tl.extra, "intel") and hasattr(tl.extra.intel, "libdevice"):
+        libdevice = tl.extra.intel.libdevice
+        math = tl.math
+    else:
+        libdevice = tl.math
+        math = tl
+
+
+try:
+    from triton.language.standard import _log2
+except ImportError:
+
+    def _log2(x):
+        raise NotImplementedError
+
+
+@triton.jit
+def promote_to_tensor(x):
+    # Addition promotes to tensor for us
+    return x + tl.zeros((1,), tl.int1)
+
+
+@triton.jit
+def div_floor_integer(a, b):
+    # NOTE: a // b is C division, but we want floor division
+    # Based on c10::div_floor_integer
+    quot = a // b
+    remainder = a % b
+    fixed = tl.where(remainder != 0, quot - 1, quot)
+    return tl.where((a < 0) != (b < 0), fixed, quot)
+
+
+@triton.jit
+def
remainder_integer(a, b): + # NOTE: a % b matches C division, not floor division + remainder = a % b + return tl.where(remainder != 0 and ((a < 0) != (b < 0)), remainder + b, remainder) + + +@triton.jit +def is_floating(x): + return promote_to_tensor(x).dtype.is_floating() + + +@triton.jit +def _prod_accumulate(a, b): + return a * b + + +@triton.jit +def prod(input, axis): + return tl.reduce(input, axis, _prod_accumulate) + + +@triton.jit +def minimum(a, b): + mask = a < b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def maximum(a, b): + mask = a > b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def min2(a, dim): + return tl.reduce(a, dim, minimum) + + +@triton.jit +def max2(a, dim): + return tl.reduce(a, dim, maximum) + + +@triton.jit +def minimum_with_index(a_value, a_index, b_value, b_index): + mask = a_value < b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def maximum_with_index(a_value, a_index, b_value, b_index): + mask = a_value > b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def min_with_index(value, index, dim): + return tl.reduce((value, index), dim, minimum_with_index) + + +@triton.jit +def max_with_index(value, index, dim): + return tl.reduce((value, index), dim, maximum_with_index) + + +@triton.jit +def welford_reduce(value, mean, m2, weight, first_iteration): + if first_iteration: + new_weight = tl.full(weight.shape, 1, weight.dtype) + new_mean = value + new_m2 = tl.zeros_like(m2) + else: + delta = value - mean + new_weight = weight + 1 + new_mean = mean + delta / new_weight + new_m2 = m2 + delta * (value - new_mean) + return new_mean, new_m2, new_weight + + +@triton.jit +def welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2): + delta = mean_2 - mean_1 + new_weight = weight_1 + weight_2 + w2_over_w = tl.where(new_weight == 0.0, 0.0, weight_2 / new_weight) + return ( + mean_1 + delta * w2_over_w, + m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w, + new_weight, + ) + + +@triton.jit +def welford(mean, m2, weight, dim): + return tl.reduce((mean, m2, weight), dim, welford_combine) + + +@triton.jit +def device_assert_then(cond, msg, r): + tl.device_assert(cond, msg) + return r + + +@triton.jit +def randint64(seed, offset, low, high): + r0, r1, r2, r3 = tl.randint4x(seed, offset) + r0 = r0.to(tl.uint64) + r1 = r1.to(tl.uint64) + result = r0 | (r1 << 32) + size = high - low + result = result % size.to(tl.uint64) + result = result.to(tl.int64) + low + return result + + +@triton.jit +def _any_combine(a, b): + return a | b + + +@triton.jit +def any(a, dim): + return tl.reduce(a, dim, _any_combine) + + +@triton.jit +def bucketize_binary_search( + values, # 1D tensor + offsets_ptr, + indexing_dtype, + right, # bool: if true, use intervals closed on the left; see [Note: Inductor bucketize op] + OFFSETS_SIZE: 
int, + BLOCK_SHAPE, # tuple/list of block shape +): + """ + See [Note: Inductor bucketize op] + """ + + low = tl.zeros(BLOCK_SHAPE, dtype=indexing_dtype) + high = tl.full(BLOCK_SHAPE, OFFSETS_SIZE, dtype=indexing_dtype) + + full_range = OFFSETS_SIZE + 1 + while full_range > 1: + mid = (high + low) // 2 + mask = mid < OFFSETS_SIZE + bucket_upper_bound = tl.load(offsets_ptr + mid, mask=mask, other=0.0) + if right: + is_above = values >= bucket_upper_bound + else: + is_above = values > bucket_upper_bound + + low = tl.where(is_above & mask, mid + 1, low) + high = tl.where(is_above, high, mid) + + full_range = (full_range + 1) // 2 + + return low + + +@triton.jit +def pack_value_flag( + value, + flag, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + uv = value.to(DTYPE_VALUE_AS_UINT, bitcast=True).to(DTYPE_PACK) + return flag.to(DTYPE_PACK) | (uv << bitwidth) + + +@triton.jit +def unpack_value( + pack, + DTYPE_VALUE, + DTYPE_VALUE_AS_UINT, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE = tl.core._constexpr_to_value(DTYPE_VALUE) + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + value_uint = (pack >> bitwidth).to(DTYPE_VALUE_AS_UINT) + return value_uint.to(DTYPE_VALUE, bitcast=True) + + +@triton.jit +def unpack_flag(pack, DTYPE_FLAG): + return pack.to(DTYPE_FLAG) + + +@triton.jit +def exclusive_scan_decoupled_lookback( + scratch_base, + block_value, + index, + combine_fn, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value`` + DTYPE_PACK: Unsigned type twice the width of block_value + + NOTE: This function is limited to values which are 32-bits or less because + we need to pack (value, flag) into a single unsigned int. 
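+    For wider values, use exclusive_scan_decoupled_lookback_64 below, which
+    keeps the flag and the 64-bit partial sums in separate scratch slots
+    instead of packing them into one word.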
+ """ + # Publish block sum so subsequent blocks don't get stuck waiting for us + DTYPE_VALUE = block_value.dtype + pack = pack_value_flag( + block_value, + tl.full(block_value.shape, 1, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + if index > 0: + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + + # Calculate exclusive prefix scan + exclusive_prefix = tl.zeros([], DTYPE_VALUE) + prefix_valid = False + test_target = index - 1 + while test_target >= 0: + # tl.atomic_load + flag = tl.full([], 0, DTYPE_VALUE_AS_UINT) + while flag == 0: + pack = tl.atomic_add(scratch_base + test_target, 0, sem="relaxed") + flag = unpack_flag(pack, DTYPE_VALUE_AS_UINT) + + value = unpack_value(pack, DTYPE_VALUE, DTYPE_VALUE_AS_UINT) + if prefix_valid: + exclusive_prefix = combine_fn(value, exclusive_prefix) + else: + exclusive_prefix = value + prefix_valid = True + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + if prefix_valid: + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + else: + inclusive_prefix = block_value + pack = pack_value_flag( + inclusive_prefix, + tl.full([], 2, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + return exclusive_prefix + + +@triton.jit +def exclusive_scan_decoupled_lookback_64(scratch_base, block_value, index, combine_fn): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block, must be 64-bits wide + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + init: Scalar value equal to the identiy of combine_fn + """ + # Publish block sum so subsequent blocks don't get stuck waiting for us + if index > 0: + block_value_u64 = block_value.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 1, block_value_u64) + tl.debug_barrier() + flag_one = tl.full([], 1, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_one, sem="release") + + # Calculate exclusive prefix scan + exclusive_prefix = tl.zeros([], block_value.dtype) + prefix_valid = False + test_target = index - 1 + while test_target >= 0: + flag = tl.full([], 0, tl.uint64) + while flag == 0: + flag = tl.atomic_add(scratch_base + 3 * test_target + 0, 0, sem="acquire") + + value_u64 = tl.load(scratch_base + 3 * test_target + flag.to(tl.int32)) + value = value_u64.to(block_value.dtype, bitcast=True) + if prefix_valid: + exclusive_prefix = combine_fn(value, exclusive_prefix) + else: + exclusive_prefix = value + prefix_valid = True + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + if prefix_valid: + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + else: + inclusive_prefix = block_value + inclusive_prefix_u64 = inclusive_prefix.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 2, inclusive_prefix_u64) + tl.debug_barrier() + flag_two = tl.full([], 2, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_two, sem="release") + + return exclusive_prefix + + +@triton.jit +def frexp(x): + # TODO(isuruf): use inline_asm_elementwise here + y = libdevice.ilogb(x) + 1 + exponent = tl.where(x == 0, 0, y) + 
mantissa = tl.where(x == 0, 0, libdevice.ldexp(x, -y)) + return mantissa, exponent + + +@triton.jit +def _compare_and_swap_with_index( + x, + idxs, + rnumel, + flip, + i: tl.constexpr, + n_dims: tl.constexpr, + stable: tl.constexpr, + descending: tl.constexpr, +): + n_outer: tl.constexpr = x.numel >> n_dims + shape: tl.constexpr = [n_outer * 2**i, 2, 2 ** (n_dims - i - 1)] + + idtype = tl.core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True) + + y = tl.reshape(x, shape) + iy = y.to(idtype, bitcast=True) + # slice left/right with 'stride' 2**(n_dims - i - 1) + right_mask = tl.arange(0, 2)[None, :, None].to(idtype) + left_mask = (1 - right_mask).to(idtype) + ileft = tl.broadcast_to(tl.sum(iy * left_mask, 1)[:, None, :], shape) + iright = tl.broadcast_to(tl.sum(iy * right_mask, 1)[:, None, :], shape) + ileft = tl.reshape(ileft, x.shape) + iright = tl.reshape(iright, x.shape) + left = ileft.to(x.dtype, bitcast=True) + right = iright.to(x.dtype, bitcast=True) + + # idx + y_idx = tl.reshape(idxs, shape) + left_idx = tl.broadcast_to( + tl.sum(y_idx * left_mask.to(y_idx.dtype), 1)[:, None, :], shape + ) + right_idx = tl.broadcast_to( + tl.sum(y_idx * right_mask.to(y_idx.dtype), 1)[:, None, :], shape + ) + left_idx = tl.reshape(left_idx, x.shape) + right_idx = tl.reshape(right_idx, x.shape) + + # valid + if rnumel is None: + left_valid_mask = tl.full(x.shape, True, tl.int1) + right_valid_mask = tl.full(x.shape, True, tl.int1) + else: + left_valid_mask = left_idx < rnumel + right_valid_mask = right_idx < rnumel + + # actual compare-and-swap + ix = x.to(idtype, bitcast=True) + + if descending: + cond = left < right + else: + cond = left > right + + if stable: + # When stable sorting, tie break by index + cond = cond | ((left == right) & (left_idx > right_idx)) + + cond = (right_valid_mask > left_valid_mask) | ( + (right_valid_mask == left_valid_mask) & cond + ) + cond = cond ^ flip + ret = ix ^ tl.where(cond, ileft ^ iright, tl.zeros_like(ix)) + new_idxs = idxs ^ tl.where(cond, left_idx ^ right_idx, tl.zeros_like(idxs)) + + return ret.to(x.dtype, bitcast=True), new_idxs + + +@triton.jit +def _bitonic_merge_with_index( + x, + idxs, + rnumel, + stage: tl.constexpr, + alternating: tl.constexpr, + n_dims: tl.constexpr, + stable: tl.constexpr, + descending: tl.constexpr, +): + n_outer: tl.constexpr = x.numel >> n_dims + tl.static_assert(stage <= n_dims) + # flip denotes whether to re-arrange sub-sequences of elements in ascending or + # descending order. + # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage + # if flip = 00110011... 
then all the elements will be re-arranged alternatingly (with + # a stride of 2) at this stage + if alternating: + shape: tl.constexpr = [n_outer * 2 ** (n_dims - 1 - stage), 2, 2**stage] + flip = tl.reshape( + tl.broadcast_to(tl.arange(0, 2)[None, :, None], shape), x.shape + ) + else: + flip = False + # perform `stage` rounds of `compare-and-swap` + for i in tl.static_range(stage): + x, idxs = _compare_and_swap_with_index( + x, idxs, rnumel, flip, i + (n_dims - stage), n_dims, stable, descending + ) + return x, idxs + + +@triton.jit +def sort_with_index( + x, # value + idxs, # index + rnumel, # number of elements + dim: tl.constexpr = None, + stable: tl.constexpr = tl.constexpr(False), + descending: tl.constexpr = tl.constexpr(False), +): + x, idxs = tl.broadcast(x, idxs) + # handle default dimension or check that it is the most minor dim + _dim: tl.constexpr = len(x.shape) - 1 if dim is None else dim + tl.static_assert( + _dim == len(x.shape) - 1, "only minor dimension is currently supported" + ) + # iteratively run bitonic merge-sort steps + n_dims: tl.constexpr = _log2(x.shape[_dim]) + + for i in tl.static_range(1, n_dims + 1): + x, idxs = _bitonic_merge_with_index( + x, + idxs, + rnumel, + i, + alternating=i < n_dims, + n_dims=n_dims, + stable=stable, + descending=descending, + ) + return x, idxs + + +@triton.jit +def select_one(x, mask, dim, keep_dims=False): + idtype = tl.core.get_int_dtype(x.dtype.primitive_bitwidth, signed=False) + ix = x.to(idtype, bitcast=True) + iy = tl.sum(ix * mask, dim, keep_dims=keep_dims) + return iy.to(x.dtype, bitcast=True) diff --git a/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc2f9afc8d61f61f81056c64f1b85688c0adaf3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py @@ -0,0 +1,1807 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import builtins +import copy +import functools +import hashlib +import inspect +import logging +import math +import operator +import os +import os.path +import re +import sys +import threading +import time +from typing import Any, Dict, List, Optional, Set, Tuple + +import torch + +from .autotune_cache import AutotuneCache +from .benchmarking import benchmarker +from .coordinate_descent_tuner import CoordescTuner +from .hints import ( + _NUM_THREADS_PER_WARP, + AutotuneHint, + DeviceProperties, + HeuristicType, + ReductionHint, + TileHint, + TRITON_MAX_BLOCK, +) +from .runtime_utils import ( + cache_dir, + ceildiv, + conditional_product, + create_bandwidth_info_str, + dynamo_timed, + get_first_attr, + get_max_y_grid, + get_num_bytes, + next_power_of_2, + triton_config_to_hashable, + validate_triton_config, +) + + +try: + import triton +except ImportError: + triton = None + +if triton is not None: + from triton import Config + from triton.compiler import CompiledKernel + from triton.runtime.autotuner import OutOfResources + from triton.runtime.jit import KernelInterface + + try: + from triton.compiler.compiler import ASTSource + except ImportError: + ASTSource = None + + try: + from triton.backends.compiler import GPUTarget + except ImportError: + GPUTarget = None +else: + Config = object + KernelInterface = object + OutOfResources = object + ASTSource = None + GPUTarget = None + +try: + autograd_profiler = torch.autograd.profiler +except AttributeError: # Compile workers 
only have a mock version of torch + + class autograd_profiler: # type: ignore[no-redef] + _is_profiler_enabled = False + + +log = logging.getLogger(__name__) + + +def autotune_hints_to_configs( + hints: Set[AutotuneHint], size_hints, block_size: int +) -> List[Config]: + """ + AutotuneHints can be attached to the metadata of triton kernels for providing + suggestions about what to try for autotuning. One reason to do this is if there are + some configs that are only useful in specific scenarios, in which case we can avoid + wasting compile time on autotuning unless we know we are in one of those scenarios. + + Based on those hints, this function will generate a list of additional autotuning + configs to try. + """ + xyz_options: Tuple[Tuple[int, Optional[int], Optional[int]], ...] + configs = [] + + for hint in hints: + if hint == AutotuneHint.ELEMENTS_PER_WARP_32: + if len(size_hints) == 1: + xyz_options = ((block_size // 4, None, None),) + elif len(size_hints) == 2: + xyz_options = ((block_size // 4, 1, None), (1, block_size // 4, None)) + elif len(size_hints) == 3: + xyz_options = ( + (block_size // 4, 1, 1), + (1, block_size // 4, 1), + (1, 1, block_size // 4), + ) + for xyz in xyz_options: + configs.append( + triton_config( + size_hints, + *xyz, + num_elements_per_warp=32, + ) + ) + + return configs + + +def disable_pointwise_autotuning(inductor_meta): + # Autotuning can give different benchmarking results from run to run, and + # therefore we disable autotuning when the use_deterministic flag is on. + if inductor_meta.get("are_deterministic_algorithms_enabled"): + return True + return not inductor_meta.get("autotune_pointwise", True) + + +def _dump_launch_params(args, kwargs, launcher, kernel_name): + call_args = [] + call_kwargs = {} + for arg in args: + if isinstance(arg, (int, bool)): + call_args.append(str(arg)) + else: + call_args.append("T") + for k, v in kwargs.items(): + # mark non-scalar kwarg values with a "T" placeholder, mirroring the args loop + if isinstance(v, (int, bool)): + call_kwargs[k] = v + else: + call_kwargs[k] = "T" + for k, v in launcher.config.kwargs.items(): + call_kwargs[k] = v + call_kwargs["num_warps"] = launcher.config.num_warps + call_kwargs["num_stages"] = launcher.config.num_stages + args_str = "" + args_str += ", ".join(call_args) + for k, v in call_kwargs.items(): + args_str += f", {k}={v}" + + abs_path = os.path.abspath(sys.argv[0]) + with open(f"{abs_path}.launch_params", "a") as f: + f.write(f"{kernel_name} | {args_str}\n") + + +class CachingAutotuner(KernelInterface): + """ + Simplified version of Triton autotuner that has no invalidation + key and caches the best config to disk to improve cold start times. + Unlike the main triton Autotuner, this version can precompile all + configs, and does not rely on the Triton JIT.
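+
+ Rough usage sketch (hypothetical values; inductor normally constructs
+ instances via the ``cached_autotune`` decorator defined later in this file)::
+
+     autotuner = CachingAutotuner(
+         fn,  # a triton.jit-ed kernel
+         triton_meta=triton_meta,
+         configs=candidate_configs,
+         save_cache_hook=None,
+         mutated_arg_names=[],
+         heuristic_type=HeuristicType.POINTWISE,
+     )
+     autotuner.run(*call_args, grid=grid(numel), stream=stream)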
+ """ + + def __init__( + self, + fn, + triton_meta, # passed directly to triton + configs, + save_cache_hook, + mutated_arg_names: List[str], # see [Note: clone mutated buffers] + heuristic_type, + size_hints=None, + inductor_meta=None, # metadata not relevant to triton + custom_kernel=False, # whether the kernel is inductor-generated or custom + filename: Optional[str] = None, + ): + super().__init__() + + assert len(configs) > 0, "Non-empty TritonConfig list required for compiling" + # makes sure there are no pre-hooks on any of the triton configs + for cfg in configs: + validate_triton_config(cfg) + + self.fn = fn + self.device_props: DeviceProperties = triton_meta["device"] + self.triton_meta = { + **triton_meta, + "device": self.device_props.index, + "device_type": self.device_props.type, + } + self.inductor_meta = {} if inductor_meta is None else inductor_meta + self.save_cache_hook = save_cache_hook + self.mutated_arg_names = mutated_arg_names + self.configs = configs + self.heuristic_type = heuristic_type + self.custom_kernel = custom_kernel + self.cuda_kernel_saved = False + if log.isEnabledFor(logging.DEBUG): + log.debug( + "CachingAutotuner gets %d configs for %s", + len(self.configs), + self.fn.__name__, + ) + for c in self.configs: + log.debug(c) + + self.launchers = [] # type: ignore[var-annotated] + self.lock = threading.Lock() + if os.getenv("TRITON_CACHE_DIR") is None: + os.environ["TRITON_CACHE_DIR"] = os.path.join( + cache_dir(), + "triton", + str(self.triton_meta.get("device", 0)), + ) + log.debug("Triton cache dir: %s", os.environ["TRITON_CACHE_DIR"]) + + self.size_hints = size_hints + self.coordesc_tuner = CoordescTuner( + is_mm=False, + name=self.fn.__name__, + size_hints=size_hints, + inductor_meta=self.inductor_meta, + ) + self.filename = filename + + self.precompile_time_taken_ns = 0 + self.autotune_time_taken_ns = 0 + + def precompile(self, warm_cache_only=False): + with self.lock: + if self.launchers: + return + self.launchers = [] + compiled_binaries = [] + if not self.configs: + raise RuntimeError("No triton configs are available") + for c in self.configs: + try: + compiled_binary, launcher = self._precompile_config( + c, warm_cache_only + ) + except OutOfResources as e: + if len(self.configs) == 1: + # There are no valid Triton configs + raise e + # Skip the config if we run out of resource + continue + self.launchers.append(launcher) + compiled_binaries.append(compiled_binary) + + if len(self.launchers) == 0: + raise RuntimeError( + "No valid triton configs. Report a fatal compilation error" + ) + + seen_configs = set(self.configs) + + device_prop = self.device_props + if ( + self.inductor_meta.get("dynamic_scale_rblock", True) + and self.heuristic_type == HeuristicType.REDUCTION + and self.size_hints is not None + # Disable for AMDGPU/Intel as Triton is not ready to return n_regs for a compiled_binary. 
+ and device_prop.type == "cuda" + and device_prop.major + and device_prop.major >= 8 + ): + assert device_prop.regs_per_multiprocessor + assert device_prop.max_threads_per_multi_processor + assert device_prop.multi_processor_count + for triton_config, compiled_binary in zip( + self.configs, compiled_binaries + ): + assert len(self.size_hints) == 2 + xblock = triton_config.kwargs.get("XBLOCK", 1) + rblock = triton_config.kwargs["RBLOCK"] + total_block = (self.size_hints[0] + xblock - 1) // xblock + nreg = getattr(compiled_binary, "n_regs", None) + if nreg is None: + continue + + # make sure rblock is not too small + if rblock <= 64: + continue + + # each SM of A100 has 65536 32-bit registers. To maximize + # the theoretical occupancy, we need to run 2048 threads on each + # SM. So each thread should use no more than 65536 / 2048 + # = 32 registers. In cases where occupancy matters, and each + # thread uses too many registers, reduce RBLOCK to reduce + # the register usage. + # For kernel https://gist.github.com/shunting314/e4cccc031fe30d378b9b23c08c238cbd + # from PLBartForCausalLM, latency improved from + # 7.795ms to 4.883ms. + # + if ( + nreg + <= device_prop.regs_per_multiprocessor + // device_prop.max_threads_per_multi_processor + ): + continue + + nreg_per_warp = nreg * 32 + nreg_per_block = nreg_per_warp * triton_config.num_warps + + # Previously we set max_blocks_per_sm to 'max_threads_per_multi_processor / (32 * num_warps)' + # The formula below is a tighter upper bound since we have the assumption that + # nreg > device_prop.regs_per_multiprocessor // device_prop.max_threads_per_multi_processor + # due to the if condition above and: + # regs_per_multiprocessor / nreg_per_block + # = regs_per_multiprocessor / (nreg * 32 * num_warps) + # < regs_per_multiprocessor / ((regs_per_multiprocessor / max_threads_per_multi_processor) * 32 * num_warps) + # = max_threads_per_multi_processor / (32 * num_warps) + # Using a tighter upper bound can reveal more optimization opportunities.
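+ # Illustrative arithmetic (hypothetical values, added for clarity): with
+ # regs_per_multiprocessor = 65536, nreg = 40 and num_warps = 8,
+ # nreg_per_block = 40 * 32 * 8 = 10240, so the bound below gives
+ # max_blocks_per_sm = 65536 // 10240 = 6, tighter than the
+ # thread-count bound 2048 / (32 * 8) = 8.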
+ max_blocks_per_sm = max( + device_prop.regs_per_multiprocessor // nreg_per_block, 1 + ) + + if ( + total_block + <= max_blocks_per_sm * device_prop.multi_processor_count + ): + # no need to improve occupancy + continue + new_config = copy.deepcopy(triton_config) + new_config.kwargs["RBLOCK"] = rblock // 2 + if new_config in seen_configs: + continue + seen_configs.add(new_config) + log.debug( + "Dynamically scale down RBLOCK from TritonConfig(%s) and get a new TritonConfig(%s)", + triton_config, + new_config, + ) + self.launchers.append( + self._precompile_config(new_config, warm_cache_only)[1] + ) + self.configs = None + + def get_device_interface(self): + # this code cannot run in compile workers, because it imports from torch + from torch._dynamo.device_interface import get_interface_for_device + + return get_interface_for_device(self.device_props.type.replace("hip", "cuda")) + + def _precompile_config(self, cfg: Config, warm_cache_only: bool): + """Ahead of time compile a given autotuner config.""" + compile_meta = copy.deepcopy(self.triton_meta) + for k, v in cfg.kwargs.items(): + if self.device_props.type == "hip": + if k == "matrix_instr_nonkdim": + compile_meta["matrix_instr_nonkdim"] = v + continue + if k == "waves_per_eu": + compile_meta["waves_per_eu"] = v + continue + compile_meta["constants"][self.fn.arg_names.index(k)] = v + compile_meta["num_warps"] = cfg.num_warps + compile_meta["num_stages"] = cfg.num_stages + compile_meta["debug"] = self.inductor_meta.get( + "assert_indirect_indexing", True + ) and not self.inductor_meta.get("is_hip", False) + + # device type will be "hip" rather than "cuda" here + compile_meta["device_type"] = self.device_props.type + compile_meta["cc"] = self.device_props.cc + + if ASTSource: + compile_args = ( + ASTSource( + self.fn, + compile_meta["signature"], + compile_meta["constants"], + compile_meta["configs"][0], + ), + ) + + cc_str = str(compile_meta["cc"]) + if "gfx10" in cc_str or "gfx11" in cc_str: + rocm_warp_size = 32 + else: + rocm_warp_size = 64 + + if GPUTarget: + target = GPUTarget( + compile_meta["device_type"], + compile_meta["cc"], + rocm_warp_size if torch.version.hip else 32, + ) + else: + target = ( + (compile_meta["device_type"], compile_meta["cc"]) + if not torch.version.hip + else [ + compile_meta["device_type"], + compile_meta["cc"], + rocm_warp_size, + ] + ) + + options = { + "num_warps": compile_meta["num_warps"], + "num_stages": compile_meta["num_stages"], + "debug": compile_meta["debug"], + } + if self.device_props.type == "hip": + if "waves_per_eu" in compile_meta: + options["waves_per_eu"] = compile_meta["waves_per_eu"] + if "matrix_instr_nonkdim" in compile_meta: + options["matrix_instr_nonkdim"] = compile_meta[ + "matrix_instr_nonkdim" + ] + compile_kwargs = { + "target": target, + "options": options, + } + else: + compile_args = (self.fn,) + compile_kwargs = compile_meta + + if warm_cache_only: + return ( + triton.compile(*compile_args, **compile_kwargs), + None, + ) + + # importing from torch is safe now that precompile has returned + from torch._dynamo.device_interface import DeviceGuard + + device_interface = self.get_device_interface() + + # load binary to the correct device + with DeviceGuard(device_interface, compile_meta["device"]): # type: ignore[attr-defined] + # need to initialize context + device_interface.synchronize(device_interface.current_device()) + + try: + binary = triton.compile(*compile_args, **compile_kwargs) + except Exception: + log.exception( + "Triton compilation failed: %s\n%s\nmetadata: 
%s", + self.inductor_meta.get("kernel_name", "triton_"), + self.fn.src, + compile_meta, + ) + raise + binary._init_handles() + + call_args = [ + arg + for i, arg in enumerate(self.fn.arg_names) + if i not in self.fn.constexprs + ] + def_args = [name for name in self.fn.arg_names if name not in cfg.kwargs] + + binary_shared = ( + binary.shared if hasattr(binary, "shared") else binary.metadata.shared + ) + + scope = { + "grid_meta": cfg.kwargs, + "bin": binary, + "launch_enter_hook": CompiledKernel.launch_enter_hook, + "launch_exit_hook": CompiledKernel.launch_exit_hook, + "metadata": binary.packed_metadata + if hasattr(binary, "packed_metadata") + else binary.metadata, + "shared": binary_shared, + } + + scope["num_warps"] = ( + binary.num_warps + if hasattr(binary, "num_warps") + else binary.metadata.num_warps + ) + + scope["cta_args"] = ( + (binary.num_ctas, *get_first_attr(binary, "cluster_dims", "clusterDims")) + if hasattr(binary, "num_ctas") + else ( + (binary.metadata.num_ctas, *binary.metadata.cluster_dims) + if hasattr(binary, "metadata") + else () + ) + ) + + scope["function"] = get_first_attr(binary, "function", "cu_function") + + def get_launch_args_without_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args before CompiledKernel.launch_metadata is added. + """ + return ( + grid_0, + grid_1, + grid_2, + num_warps, + *cta_args, + shared, + stream, + function, + launch_enter_hook, + launch_exit_hook, + metadata, + ) + + # Getting the kernel launch args is extremely perf-sensitive. Evaluating + # `bin.launch_metadata` is relatively expensive, and returns None unless a + # `launch_enter_hook` is installed. So if we don't have that hook installed, + # we want to burn None in to the launch args with zero overhead. + # See https://github.com/pytorch/pytorch/issues/123597 + if binary.launch_enter_hook: + + def get_launch_args_with_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args after CompiledKernel.launch_metadata is added + by https://github.com/openai/triton/pull/3492 . + """ + return ( + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin.launch_metadata(grid, stream, *args), + launch_enter_hook, + launch_exit_hook, + ) + + else: + + def get_launch_args_with_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args after CompiledKernel.launch_metadata is added + by https://github.com/openai/triton/pull/3492 . 
+ """ + return ( + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + None, + launch_enter_hook, + launch_exit_hook, + ) + + scope["get_launch_args"] = ( + get_launch_args_with_kernel_launch_metadata + if hasattr(binary, "launch_metadata") + else get_launch_args_without_kernel_launch_metadata + ) + + scope["runner"] = get_first_attr(binary, "run", "c_wrapper") + + exec( + f""" + def launcher({', '.join(def_args)}, grid, stream): + if callable(grid): + grid_0, grid_1, grid_2 = grid(grid_meta) + else: + grid_0, grid_1, grid_2 = grid + + args = {', '.join(call_args)}, + launch_args = get_launch_args( + grid, grid_0, grid_1, grid_2, stream, function, + metadata, bin, launch_enter_hook, launch_exit_hook, + num_warps, shared, cta_args, args + ) + runner(*launch_args, *args) + return bin + """.lstrip(), + scope, + ) + + launcher = scope["launcher"] + launcher.config = cfg + launcher.n_regs = getattr(binary, "n_regs", None) + launcher.n_spills = getattr(binary, "n_spills", None) + launcher.shared = binary_shared + launcher.store_cubin = self.inductor_meta.get("store_cubin", False) + # store this global variable to avoid the high overhead of reading it when calling run + if launcher.store_cubin: + launcher.fn = self.fn + launcher.bin = binary + + return binary, launcher + + def bench(self, launcher, *args, grid, with_profiler=False, **kwargs): + """Measure the performance of a given launcher""" + # we don't skip configs wiht spilled registers when auto-tuning custom + # (user-written) Triton kernels, as (i) we don't have any knowledge or + # control over the kernel code; (ii) there is empirical evidence that + # for some (complicated) custom Triton kernels, a register-spilling + # config may yield the best latency. + if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get( + "spill_threshold", 16 + ): + log.debug( + "Skip config %s because of register spilling: %d", + launcher.config, + launcher.n_spills, + ) + return float("inf") + + device_interface = self.get_device_interface() + stream = device_interface.get_raw_stream(device_interface.current_device()) + + def kernel_call(): + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + grid=grid, + stream=stream, + ) + + if with_profiler: + from torch._inductor.utils import do_bench_using_profiling + + return do_bench_using_profiling(kernel_call, warmup=10, rep=40) + + return benchmarker.benchmark_gpu(kernel_call, rep=40, fast_flush=True) + + def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]: + from ..compile_fx import clone_preserve_strides + + # [Note: clone mutated buffers] + # clone inplace buffers to avoid autotune contaminating them if + # the kernel does in-place stores. 
Avoid cloning other buffers because + # it leads to increased memory use + cloned_args = [] + for i, arg in enumerate(args): + if self.fn.arg_names[i] in self.mutated_arg_names: + assert isinstance(arg, torch.Tensor) + cloned_args.append(clone_preserve_strides(arg)) + else: + cloned_args.append(arg) + + cloned_kwargs: Dict[str, Any] = {} + for name, arg in kwargs.items(): + if name in self.mutated_arg_names: + assert isinstance(arg, torch.Tensor) + cloned_kwargs[name] = clone_preserve_strides(arg) + else: + cloned_kwargs[name] = arg + + return cloned_args, cloned_kwargs + + def benchmark_all_configs(self, *args, **kwargs): + with dynamo_timed("CachingAutotuner.benchmark_all_configs"): + timings = { + launcher: self.bench(launcher, *args, **kwargs) + for launcher in self.launchers + } + + for k, v in timings.items(): + self.coordesc_tuner.cache_benchmark_result(k.config, v) + + if log.isEnabledFor(logging.DEBUG): + log.debug("Benchmark all input configs for %s, get:", self.fn.__name__) + for k, v in timings.items(): + log.debug( + "%s: %f, nreg %d, nspill %d, #shared-mem %s", + k.config, + v, + k.n_regs, + k.n_spills, + k.shared, + ) + + return timings + + def autotune_to_one_config(self, *args, **kwargs): + """Do the actual autotuning""" + start_time = time.time_ns() + timings = self.benchmark_all_configs(*args, **kwargs) + benchmark_time_taken_ns = time.time_ns() - start_time + self.launchers = [builtins.min(timings, key=timings.get)] + self.autotune_time_taken_ns = ( + self.precompile_time_taken_ns + benchmark_time_taken_ns + ) + if self.save_cache_hook: + self.save_cache_hook(self.launchers[0].config, self.autotune_time_taken_ns) + + def save_gpu_kernel(self, grid, stream, launcher): + if callable(grid): + grid_x, grid_y, grid_z = grid(launcher.config.kwargs) + else: + grid_x, grid_y, grid_z = grid + + key = self.inductor_meta.get("kernel_name", None) # unique kernel name + assert key is not None, "kernel_name cannot be None" + params = { + "mangled_name": launcher.bin.metadata.name + if hasattr(launcher.bin.metadata, "name") + else launcher.bin.metadata["name"], + "grid_x": grid_x, + "grid_y": grid_y, + "grid_z": grid_z, + "x_block": launcher.config.kwargs.get("XBLOCK", 1), + "y_block": launcher.config.kwargs.get("YBLOCK", None), + "z_block": launcher.config.kwargs.get("ZBLOCK", None), + "num_warps": launcher.bin.num_warps + if hasattr(launcher.bin, "num_warps") + else launcher.bin.metadata.num_warps, + "shared_mem": launcher.bin.shared + if hasattr(launcher.bin, "shared") + else launcher.bin.metadata.shared, + "stream": stream, + # User defined triton kernels will have arbitrary kwarg names + "meta": launcher.config.kwargs, + } + from torch._inductor.codecache import CudaKernelParamCache + + bin_type = {"hip": "hsaco", "xpu": "spv"}.get(self.device_props.type, "cubin") + binary = launcher.bin.asm[bin_type] + CudaKernelParamCache.set(key, params, binary, bin_type) + + self.cuda_kernel_saved = True + + def coordinate_descent_tuning(self, launcher, *args, **kwargs): + """ + Coordinate descent tuning can be run with or without max-autotune. + + The only difference between these two is the starting config for coordinate_descent tuning. + E.g., assume regular autotune only gets one config C1, while max-autotune gets 4 configs C1, C2, C3, C4 + and figures out that C3 is the best. + + Then, if coordinate descent tuning is run with max-autotune disabled, it will start from C1; + while if coordinate descent tuning is run with max-autotune enabled, it will start from C3.
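+
+ From that starting config, the tuner (see CoordescTuner) perturbs one
+ tunable field at a time (e.g. XBLOCK, RBLOCK, num_warps) and keeps a
+ change only when the benchmarked latency improves.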
+ """ + if ( + self.heuristic_type == HeuristicType.TEMPLATE + or self.heuristic_type == HeuristicType.USER_AUTOTUNE + ): + # skip triton template + return launcher + + config2launcher = {launcher.config: launcher} + + def benchmark_one_config(config): + with self.lock: + _, launcher = self._precompile_config(config, False) + config2launcher[config] = launcher + + out = self.bench(launcher, *args, **kwargs) + log.debug( + "COORDESC: %s: %f, nreg %d, nspill %d, #shared-mem %d", + launcher.config, + out, + launcher.n_regs, + launcher.n_spills, + launcher.shared, + ) + return out + + assert not ( + self.heuristic_type == HeuristicType.PERSISTENT_REDUCTION + and "RBLOCK" in launcher.config.kwargs + ), "Coordinate descent tuner relies on the assumption that persistent reduction's triton config does not have RBLOCK" + start_time = time.time_ns() + best_config = self.coordesc_tuner.autotune( + benchmark_one_config, launcher.config, None + ) + coordesc_time_taken_ns = time.time_ns() - start_time + best_config.found_by_coordesc = True + + if self.save_cache_hook: + self.save_cache_hook( + best_config, + self.autotune_time_taken_ns + coordesc_time_taken_ns, + found_by_coordesc=True, + ) + return config2launcher.get(best_config) + + def run(self, *args, grid, stream, **kwargs): + if len(self.launchers) != 1: + if len(self.launchers) == 0: + start_time = time.time_ns() + self.precompile() + self.precompile_time_taken_ns = time.time_ns() - start_time + if len(self.launchers) > 1: + self.autotune_to_one_config(*args, grid=grid, **kwargs) + + if not getattr( + self.launchers[0].config, "found_by_coordesc", False + ) and self.inductor_meta.get("coordinate_descent_tuning", False): + self.launchers = [ + self.coordinate_descent_tuning( + self.launchers[0], *args, grid=grid, **kwargs + ) + ] + + (launcher,) = self.launchers + if launcher.store_cubin: + self.save_gpu_kernel(grid, stream, launcher) + + if os.environ.get("TORCHINDUCTOR_DUMP_LAUNCH_PARAMS", 0) == "1": + _dump_launch_params(args, kwargs, launcher, self.fn.__name__) + + # it is faster than entering and exiting a context manager, even if the context + # manager is a nullcontext. + if autograd_profiler._is_profiler_enabled: + # grid can be a tuple of ints or a string. 
+ if isinstance(grid, tuple): + grid_info = str(grid) + else: + grid_info = getattr(grid, "grid_fn_str", "") + with torch._C._profiler._RecordFunctionFast( + self.inductor_meta.get("kernel_name", "triton kernel"), + args, + { + "kernel_file": "" if self.filename is None else self.filename, + "kernel_backend": "triton", + "grid": grid_info, + "stream": stream, + }, + ): + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + else: + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + + +def _find_names(obj): + import gc + import inspect + + frame = inspect.currentframe() + while frame is not None: + frame.f_locals + frame = frame.f_back + obj_names = [] + for referrer in gc.get_referrers(obj): + if isinstance(referrer, dict): + for k, v in referrer.items(): + if v is obj: + obj_names.append(k) + return obj_names + + +collected_calls: List[Any] = [] + + +def start_graph(): + collected_calls.clear() + + +def end_graph(output_file): + if len(collected_calls) == 0: + return + overall_time = sum(call[0] for call in collected_calls) + overall_gb = sum(call[1] for call in collected_calls) + cur_file = inspect.stack()[1].filename + summary_str = ( + f"SUMMARY ({cur_file})\n" + f"{overall_time:.2f}ms \t {overall_gb:.2f} GB\t {overall_gb/(overall_time/1e3):.2f}GB/s" + ) + print(summary_str) + print() + if output_file is not None: + # sort perf numbers in descending order, i.e. placing the + # most runtime-heavy kernels at the top of the list + sorted_calls = sorted(collected_calls, key=lambda c: float(c[0]), reverse=True) + try: + with open(output_file, "a") as file: + log.debug("Save profile bandwidth results to %s", output_file) + file.write("====================\n") + file.write(f"TRITON KERNELS BANDWIDTH INFO ({cur_file})\n") + for ms, num_gb, gb_per_s, kernel_name in sorted_calls: + # also display the runtime percentage for each kernel + percentage = f"{ms/overall_time*100:.2f}%" + suffix = f" \t {percentage} \t {kernel_name}" + bw_info_str = create_bandwidth_info_str( + ms, + num_gb, + gb_per_s, + suffix=suffix, + color=False, + ) + file.write(bw_info_str + "\n") + file.write(f"{summary_str}\n\n") + except Exception as e: + log.warning( + "failed to write profile bandwidth result into %s: %s", + output_file, + e, + ) + + +class DebugAutotuner(CachingAutotuner): + def __init__(self, *args, regex_filter="", with_profiler=False, **kwargs): + self.regex_filter = regex_filter + self.with_profiler = with_profiler + super().__init__(*args, **kwargs) + self.cached = None + + def run(self, *args, grid, stream): + possible_names = _find_names(self) + kernel_name = f"{max(possible_names, key=len)}" + if not re.match(self.regex_filter, kernel_name): + return + super().run(*args, grid=grid, stream=stream) + (launcher,) = self.launchers + + if self.cached is None: + ms = self.bench( + launcher, *args, grid=grid, with_profiler=self.with_profiler + ) + num_in_out_ptrs = len( + [ + arg_name + for arg_name in self.fn.arg_names + if arg_name.startswith("in_out_ptr") + ] + ) + num_gb = self.inductor_meta.get("kernel_num_gb", None) + if num_gb is None: + num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9 + gb_per_s = num_gb / (ms / 1e3) + self.cached = ms, num_gb, gb_per_s, kernel_name + collected_calls.append((ms, num_gb, gb_per_s, kernel_name)) + print( + create_bandwidth_info_str( + ms, num_gb, gb_per_s, suffix=f" \t {kernel_name}" + ) + ) + + +def hash_configs(configs: List[Config]): + """ + Hash used to check for changes in configurations + """ + 
hasher = hashlib.sha256() + for cfg in configs: + hasher.update( + f"{sorted(cfg.kwargs.items())} {cfg.num_warps} {cfg.num_stages}\n".encode() + ) + return hasher.hexdigest() + + +def cached_autotune( + size_hints: Optional[List[int]], + configs: List[Config], + triton_meta, + heuristic_type, + filename=None, + inductor_meta=None, + custom_kernel=False, +): + """ + A copy of triton.autotune that calls our subclass. Our subclass + has additional debugging, error handling, and on-disk caching. + """ + configs = unique_configs(configs) + assert len(configs) == 1 or filename + inductor_meta = {} if inductor_meta is None else inductor_meta + + disabled = inductor_meta.get("force_disable_caches", False) + + # on disk caching logic and/or remote caching + autotune_cache = None + if ( + not disabled + and filename is not None + and (len(configs) > 1 or inductor_meta.get("coordinate_descent_tuning")) + ): + configs_hash = hash_configs(configs) + + autotune_cache = AutotuneCache.create(inductor_meta, filename, configs_hash) + if autotune_cache: + if best_config := autotune_cache.read_best(inductor_meta, configs): + configs = [best_config] + + else: + if disabled: + log.debug("autotune caching is disabled by config.force_disable_caches") + + mutated_arg_names = inductor_meta.pop("mutated_arg_names", ()) + + def decorator(fn): + # Remove XBLOCK from config if it's not a function argument. + # This way, coordinate descent tuning will not try to tune it. + # + # Context: When TritonKernel.no_x_dim is True, we hardcode XBLOCK to 1. + import inspect + + if "XBLOCK" not in inspect.signature(fn.fn).parameters: + for tconfig in configs: + if "XBLOCK" in tconfig.kwargs: + assert tconfig.kwargs["XBLOCK"] == 1 + tconfig.kwargs.pop("XBLOCK") + + if inductor_meta.get("profile_bandwidth"): + return DebugAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + regex_filter=inductor_meta["profile_bandwidth_regex"], + with_profiler=inductor_meta[ + "profile_bandwidth_with_do_bench_using_profiling" + ], + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + ) + return CachingAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + ) + + return decorator + + +def unique_configs(configs: List[Config]): + """Remove duplicate configurations""" + seen = set() + pruned_configs = [] + + for cfg in configs: + key = triton_config_to_hashable(cfg) + if key not in seen: + seen.add(key) + pruned_configs.append(cfg) + return pruned_configs + + +def check_config(cfg, *, xnumel=None, ynumel=None, znumel=None): + for numel, label in zip((xnumel, ynumel, znumel), "XYZ"): + if numel is None: + continue + block = cfg[f"{label}BLOCK"] + if numel == 1: + assert block == 1, ( + f"TritonKernel.indexing assumes numel == 1 => BLOCK == 1" + f" but {label.lower()}numel=={numel} and {label}BLOCK={block} (cfg={cfg})." + ) + max_block = TRITON_MAX_BLOCK[label] + max_block_str = f'config.triton.max_block["{label}"]' + assert max_block % block == 0, ( + f"TritonKernel.indexing assumes {label}BLOCK divides {max_block_str}" + f" but {label}BLOCK={block} and {max_block_str}={max_block} (cfg={cfg})." 
+ ) + + +def _num_warps(num_warps, max_num_warps=8, min_num_warps=2, register_intensive=False): + # On AMD GPU each warp has 64 lanes which is double the size on NV GPU, + # therefore using half the number of warps here correspondingly. + if torch.version.hip: + max_num_warps = (max_num_warps + 1) // 2 + min_num_warps = (min_num_warps + 1) // 2 + # persistent reduction is register intensive + if register_intensive: + max_num_warps = max_num_warps // 2 + return next_power_of_2(min(max(num_warps, min_num_warps), max_num_warps)) + + +def _check_max_grid_x(size_hints, x, num_warps): + # Check if maxGridSize is exceeded - if so then must scale XBLOCK further + max_grid_x = 2147483647 + warp_size = ( + 64 if torch.version.hip else 32 + ) # TODO: query warp size once #129663 is merged + num_blocks = (size_hints[0] + x - 1) // x + + while (num_blocks * num_warps * warp_size) > max_grid_x and x < size_hints[0]: + x *= 2 # Scale up XBLOCK if grid exceeds limits + num_blocks = num_blocks // 2 + if x >= max_grid_x: + raise AssertionError( + "Reduction config exceeds cudaDeviceProp maxGridSize. Please raise a pytorch issue" + ) + return x, num_blocks + + +def triton_config( + size_hints, + x, + y=None, + z=None, + num_stages=1, + num_elements_per_warp=256, + min_elem_per_thread=0, +) -> Config: + """ + Construct a pointwise triton config with some adjustment heuristics + based on size_hints. Size_hints is a tuple of numels in each tile + dimension and will be rounded up to the nearest power of 2. + + num_elements_per_warp is a suggestion for controlling how many warps + the triton config should contain. e.g.: if x=16, y=8, z=4 then + num_elements = 16*8*4 = 512. Then if we set num_elements_per_warp=128, + we'll launch 512 (elem) / 128 (elem/warp) = 4 warps. Note that it's + just a suggestion, and sometimes other adjustment heuristics will + override the num_elements_per_warp. + + min_elem_per_thread controls the minimum number of elements + processed by each thread. It's always enforced. + """ + # Ideally we want to read this from some device config + + # for a 2d size_hints [a, b], a should be mapped to YBLOCK rather than XBLOCK + size_hints = list(reversed(size_hints)) + + maxGridSize = [2147483647, 65535, 65535] + + target = conditional_product(x, y, z) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + if y: + y = min(y, size_hints[1]) + if z: + z = min(z, size_hints[2]) + + # if we are below original block size, scale up where we can; + # or if the calculated grid size is larger than the limit, we bump up the corresponding dimension + while x < min(size_hints[0], TRITON_MAX_BLOCK["X"]) and ( + x * maxGridSize[0] < size_hints[0] or conditional_product(x, y, z) < target + ): + x *= 2 + while ( + y + and y < min(size_hints[1], TRITON_MAX_BLOCK["Y"]) + and ( + y * maxGridSize[1] < size_hints[1] or conditional_product(x, y, z) < target + ) + ): + y *= 2 + while ( + z + and z < min(size_hints[2], TRITON_MAX_BLOCK["Z"]) + and ( + z * maxGridSize[2] < size_hints[2] or conditional_product(x, y, z) < target + ) + ): + z *= 2 + + num_warps = _num_warps( + conditional_product(x, y, z) // num_elements_per_warp, min_num_warps=1 + ) + # we are going to arrive at 2 warps only if bs was too small due to + # numel being too small. 
However, to work around some PTX bugs, we still + # want at least 4 warps if there are enough elements per thread. + # Given that this is a rare situation, don't expect this to affect perf + # in general + # see https://github.com/pytorch/pytorch/pull/97950 + if conditional_product(x, y, z) >= 128 and not torch.version.hip: + num_warps = max(num_warps, 4) + xnumel = size_hints[0] + ynumel = size_hints[1] if y else None + znumel = size_hints[2] if z else None + + # Increase x to satisfy min_elem_per_thread requirements. + block_size = max( + conditional_product(x, y, z), + min_elem_per_thread * _NUM_THREADS_PER_WARP * num_warps, + ) + x *= math.ceil(block_size / conditional_product(x, y, z)) + + x, _num_blocks = _check_max_grid_x(size_hints, x, num_warps) + + cfg = {"XBLOCK": x} + if y: + cfg["YBLOCK"] = y + if z: + cfg["ZBLOCK"] = z + assert x <= TRITON_MAX_BLOCK["X"], f"increase TRITON_MAX_BLOCK['X'] to {x}" + check_config(cfg, xnumel=xnumel, ynumel=ynumel, znumel=znumel) + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def triton_config_reduction( + size_hints, x, r, num_stages=1, num_warps=None, register_intensive=False +) -> Config: + """ + Construct a reduction triton config with some adjustment heuristics + based on size_hints. Size_hints is a tuple of numels in each tile + dimension and will be rounded up to the nearest power of 2. + """ + + target = conditional_product(x, r) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + r = min(r, size_hints[1]) + + # if we are below original block size, scale up where we can + while x < size_hints[0] and conditional_product(x, r) < target: + x *= 2 + while r < size_hints[1] and conditional_product(x, r) < target: + r *= 2 + + if num_warps is None: + num_warps = conditional_product(x, r) // 128 + num_warps = _num_warps( + num_warps, max_num_warps=16, register_intensive=register_intensive + ) + + x, _num_blocks = _check_max_grid_x(size_hints, x, num_warps) + + while conditional_product(x, r) > target: + if r == 1: + break + r = r // 2 + + cfg = {"XBLOCK": x, "RBLOCK": r} + check_config(cfg, xnumel=size_hints[0]) + assert x <= TRITON_MAX_BLOCK["X"], f"increase TRITON_MAX_BLOCK['X'] to {x}" + assert r <= TRITON_MAX_BLOCK["R"], f"increase TRITON_MAX_BLOCK['R'] to {r}" + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def triton_config_tiled_reduction(size_hints, x, y, r, num_stages=1): + """ + Construct a tile reduction triton config with some adjustment + heuristics based on size_hints. Size_hints is a tuple of numels in + each tile dimension and will be rounded up to the nearest power of 2.
+ """ + + target = conditional_product(x, y, r) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + y = min(y, size_hints[1]) + r = min(r, size_hints[2]) + + # if we are below original block size, scale up where we can + while x < size_hints[0] and conditional_product(x, y, r) < target: + x *= 2 + while r < size_hints[2] and conditional_product(x, y, r) < target: + r *= 2 + while y < size_hints[1] and conditional_product(x, y, r) < target: + y *= 2 + + cfg = {"XBLOCK": x, "YBLOCK": y, "RBLOCK": r} + num_warps = _num_warps(conditional_product(x, y, r) // 256, min_num_warps=1) + check_config(cfg, xnumel=size_hints[0], ynumel=size_hints[1]) + assert r <= TRITON_MAX_BLOCK["R"], f"increase TRITON_MAX_BLOCK['r'] to {r}" + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def pointwise( + size_hints, + triton_meta, + tile_hint=None, + filename=None, + min_elem_per_thread=0, + inductor_meta=None, +): + """ + Construct @triton.heuristics() based on size_hints. + """ + inductor_meta = {} if inductor_meta is None else inductor_meta + assert not inductor_meta.get("no_x_dim") + + numel = functools.reduce(operator.mul, size_hints) + bs = max(256, min(numel // 128, 1024)) + + hinted_configs = autotune_hints_to_configs( + inductor_meta.get("autotune_hints", set()), size_hints, bs + ) + + triton_config_with_settings = functools.partial( + triton_config, min_elem_per_thread=min_elem_per_thread + ) + + if len(size_hints) == 1: + if disable_pointwise_autotuning(inductor_meta) and not ( + inductor_meta.get("max_autotune") + or inductor_meta.get("max_autotune_pointwise") + ): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, bs)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + else: + return cached_autotune( + size_hints, + [ + triton_config_with_settings( + size_hints, bs, num_elements_per_warp=256 + ), + triton_config_with_settings( + size_hints, bs // 2, num_elements_per_warp=64 + ), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + if len(size_hints) == 2: + if ( + disable_pointwise_autotuning(inductor_meta) or tile_hint == TileHint.SQUARE + ) and not ( + inductor_meta.get("max_autotune") + or inductor_meta.get("max_autotune_pointwise") + ): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, 32, 32)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + return cached_autotune( + size_hints, + [ + triton_config_with_settings(size_hints, 32, 32), + triton_config_with_settings(size_hints, 64, 64), # ~8% better for fp16 + triton_config_with_settings(size_hints, 256, 16), + triton_config_with_settings(size_hints, 16, 256), + triton_config_with_settings(size_hints, bs, 1), + triton_config_with_settings(size_hints, 1, bs), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.POINTWISE, + ) + if len(size_hints) == 3: + if disable_pointwise_autotuning(inductor_meta): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, 16, 16, 16)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + return cached_autotune( + size_hints, + [ + 
triton_config_with_settings(size_hints, 16, 16, 16), + triton_config_with_settings(size_hints, 64, 8, 8), + triton_config_with_settings(size_hints, 8, 64, 8), + triton_config_with_settings(size_hints, 8, 8, 64), + triton_config_with_settings(size_hints, bs, 1, 1), + triton_config_with_settings(size_hints, 1, bs, 1), + triton_config_with_settings(size_hints, 1, 1, bs), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.POINTWISE, + ) + raise NotImplementedError(f"size_hints: {size_hints}") + + +def _reduction_configs( + *, size_hints: List[int], inductor_meta: Dict[str, Any] +) -> List[Config]: + reduction_hint = inductor_meta.get("reduction_hint", None) + assert len(size_hints) == 2 + rnumel = size_hints[-1] + + register_intensive = False + MAX_RBLOCK = 2048 + if ( + size_hints[0] >= 1024 + and inductor_meta.get("num_load", 0) + inductor_meta.get("num_reduction", 0) + >= 10 + ): + # A heuristic to reduce RBLOCK if a kernel potentially needs many registers. + # Consider loads and reductions, since loads need to move data into registers + # and reductions need an accumulator. + # + # The magic numbers are a bit arbitrary. + # + # We cannot rely on dynamically scaling down RBLOCK later, since sometimes + # Triton ends up using fewer registers with worse perf. Check: + # https://github.com/pytorch/pytorch/issues/126463 + # + # The heuristic is a very simple one since registers can be reused. But + # hopefully it can be a good enough indicator. + MAX_RBLOCK = 1024 + register_intensive = True + + contiguous_config = triton_config_reduction( + size_hints, + 1, + (rnumel if 256 <= rnumel < MAX_RBLOCK else MAX_RBLOCK), + register_intensive=register_intensive, + ) + outer_config = triton_config_reduction( + size_hints, 64, 8, register_intensive=register_intensive + ) + tiny_config = triton_config_reduction( + size_hints, + 2 * (256 // rnumel) if rnumel <= 256 else 1, + min(rnumel, MAX_RBLOCK), + register_intensive=register_intensive, + ) + if inductor_meta.get("max_autotune") or inductor_meta.get("max_autotune_pointwise"): + pass # skip all these cases + elif reduction_hint == ReductionHint.INNER: + return [contiguous_config] + elif reduction_hint == ReductionHint.OUTER: + return [outer_config] + elif reduction_hint == ReductionHint.OUTER_TINY: + return [tiny_config] + if disable_pointwise_autotuning(inductor_meta): + return [triton_config_reduction(size_hints, 32, 128)] + return [ + contiguous_config, + outer_config, + tiny_config, + triton_config_reduction(size_hints, 64, 64), + triton_config_reduction(size_hints, 8, 512), + # halve the XBLOCK/RBLOCK compared to outer_config + # TODO: this may only be beneficial when each iteration of the reduction + # is quite heavy. E.g.
https://gist.github.com/shunting314/189a8ef69f90db9d614a823385147a72 + triton_config_reduction(size_hints, 64, 4, num_warps=8), + ] + + +def reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """args to @triton.heuristics()""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + assert triton_meta is not None + rnumel = size_hints[-1] + if len(size_hints) != 2: + raise NotImplementedError(f"size_hints: {size_hints}") + + configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta) + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.REDUCTION, + filename=filename, + ) + + +def persistent_reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + xnumel, rnumel = size_hints + + configs = [ + triton_config_reduction(size_hints, xblock, rnumel, register_intensive=True) + for xblock in (1, 8, 32, 128) + if xblock == 1 or (rnumel * xblock <= 4096 and xblock <= xnumel) + ] + + # TODO(jansel): we should be able to improve these heuristics + if reduction_hint == ReductionHint.INNER and rnumel >= 256: + configs = configs[:1] + elif reduction_hint == ReductionHint.OUTER: + configs = configs[-1:] + elif reduction_hint == ReductionHint.OUTER_TINY: + configs = [ + triton_config_reduction( + size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel + ) + ] + for c in configs: + # we don't need RBLOCK for persistent reduction + c.kwargs.pop("RBLOCK") + + if disable_pointwise_autotuning(inductor_meta): + configs = configs[:1] + + return cached_autotune( + size_hints, + configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.PERSISTENT_REDUCTION, + ) + + +def split_scan( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """Heuristic for TritonSplitScanKernel""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + assert triton_meta is not None + if len(size_hints) != 2: + raise NotImplementedError(f"size_hints: {size_hints}") + + configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta) + + # Fixup configs to enforce the minimum RBLOCK size + min_rblock = inductor_meta.get("min_split_scan_rblock", 256) + for cfg in configs: + if cfg.kwargs["RBLOCK"] < min_rblock: + cfg.kwargs["RBLOCK"] = min_rblock + + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.SPLIT_SCAN, + filename=filename, + ) + + +def template(num_stages, num_warps, triton_meta, filename=None, inductor_meta=None): + """ + Compile a triton template + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=num_stages, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +def user_autotune( + configs, triton_meta, 
filename=None, inductor_meta=None, custom_kernel=False +): + """ + Compile a user defined triton kernel + """ + defaults = inspect.signature(triton.Config).parameters + default_num_stages = defaults["num_stages"].default + default_num_warps = defaults["num_warps"].default + + if len(configs) == 0: + configs = [ + triton.Config( + {}, num_stages=default_num_stages, num_warps=default_num_warps + ) + ] + else: + configs = [ + triton.Config( + c.get("kwargs", {}), + num_stages=c.get("num_stages", default_num_stages), + num_warps=c.get("num_warps", default_num_warps), + ) + for c in configs + ] + + return cached_autotune( + None, + configs, + triton_meta=triton_meta, + heuristic_type=HeuristicType.USER_AUTOTUNE, + filename=filename, + inductor_meta=inductor_meta, + custom_kernel=custom_kernel, + ) + + +def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): + """ + Compile a triton foreach kernel + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=1, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +def grid(*numels): + """Helper function to compute triton grids""" + if len(numels) == 1: + xnumel, ynumel, znumel = numels[0], None, None + elif len(numels) == 2: + xnumel, ynumel, znumel = numels[1], numels[0], None + elif len(numels) == 3: + xnumel, ynumel, znumel = numels[2], numels[1], numels[0] + else: + raise AssertionError(f"invalid size for numels {len(numels)}") + + def get_grid_dim(numel, block): + if numel is None: + return 1 + if block is None: + return numel + return ceildiv(numel, block) + + def grid_fn(meta): + x_grid = get_grid_dim(xnumel, meta.get("XBLOCK", 1)) + y_grid = get_grid_dim(ynumel, meta.get("YBLOCK", None)) + + max_y_grid = get_max_y_grid() + if znumel is None: + div = ceildiv(y_grid, max_y_grid) + y_grid = ceildiv(y_grid, div) + z_grid = div + else: + z_grid = get_grid_dim(znumel, meta.get("ZBLOCK", None)) + torch._check( + y_grid <= max_y_grid, + lambda: f"Generated y grid beyond 2^16 ({y_grid}) not supported with z dimension present. 
File issue", + ) + + return ( + x_grid, + y_grid, + z_grid, + ) + + setattr(grid_fn, "grid_fn_str", f"grid{numels}") # noqa: B010 + + return grid_fn + + +def split_scan_grid(xnumel, rnumel): + def grid_fn(meta): + assert meta.get("XBLOCK", 1) == 1 + return (ceildiv(rnumel, meta.get("RBLOCK", 1)), xnumel, 1) + + grid_fn_str = f"split_scan_grid({xnumel}, {rnumel})" + setattr(grid_fn, "grid_fn_str", grid_fn_str) # noqa: B010 + + return grid_fn + + +def grid_combo_kernels( + *numels, num_kernels, min_blocks, is_sequential, default_meta=None +): + """min_blocks is the minimal size of the grid x dimension""" + if not is_sequential: + # round robin dispatch + numels_agg = list(numels) + for i in range(len(numels_agg)): + if isinstance(numels_agg[i], (list, tuple)): + numels_agg[i] = max(max(numels_agg[i]), 0) # noqa: PLW3301 + kernel_grid_fn = grid(*numels_agg) + + if isinstance(numels[-1], (list, tuple)): + min_blocks_d = max(-min(numels[-1]), 0) * num_kernels + else: + min_blocks_d = None + if min_blocks is None: + assert min_blocks_d is not None + min_blocks = min_blocks_d + else: + assert ( + min_blocks_d is None or min_blocks == min_blocks_d + ), f"inconsistent min_blocks {min_blocks} vs x grid {numels[-1]}" + else: + # sequential dispatch + seq_numels = list(numels) + # x numels are not used here, just a place holder + seq_numels[-1] = 1024 + for i in range(len(seq_numels) - 1): + if isinstance(seq_numels[i], (list, tuple)): + seq_numels[i] = max(seq_numels[i]) + + kernel_grid_fn = grid(*seq_numels) + + def get_grid_dim(numel, block): + if numel is None: + return 1 + if block is None: + return numel + return ceildiv(numel, block) + + def grid_fn(meta): + assert min_blocks is not None, "min_blocks must be a number" + cuda_grid = list(kernel_grid_fn(meta)) + cuda_grid[0] = max(num_kernels * cuda_grid[0], min_blocks) + return tuple(cuda_grid) + + def seq_grid_fn(meta): + cuda_grid = list(kernel_grid_fn(meta)) + # x <= 0 means this kernel's x grid is not tunable (x_no_dim is true) + x_grid = sum( + [ + -x if x <= 0 else get_grid_dim(x, meta.get("XBLOCK", 1)) + for x in numels[-1] + ] + ) + cuda_grid[0] = x_grid + return tuple(cuda_grid) + + def grid_fn_default_meta(meta): + return grid_fn(default_meta) + + def seq_grid_fn_default_meta(meta): + return seq_grid_fn(default_meta) + + if default_meta is None: + return grid_fn if not is_sequential else seq_grid_fn + else: + return grid_fn_default_meta if not is_sequential else seq_grid_fn_default_meta diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_config.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_config.h new file mode 100644 index 0000000000000000000000000000000000000000..48925e1e3ab49ae135c6e9c4c501aa2f5e030913 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_config.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef DNNL_CONFIG_H +#define DNNL_CONFIG_H + +#include "oneapi/dnnl/dnnl_config.h" + +#endif /* DNNL_CONFIG_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_debug.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_debug.h new file mode 100644 index 0000000000000000000000000000000000000000..5044971832bbbe56127920a527508b207a803eea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_debug.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_DEBUG_H +#define DNNL_DEBUG_H + +#include "oneapi/dnnl/dnnl_debug.h" + +#endif /* DNNL_DEBUG_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h new file mode 100644 index 0000000000000000000000000000000000000000..ad731150b28babe7bd5a911acd8de70c57e85254 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_OCL_H +#define DNNL_OCL_H + +#include "oneapi/dnnl/dnnl_ocl.h" + +#endif /* DNNL_OCL_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_sycl.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_sycl.h new file mode 100644 index 0000000000000000000000000000000000000000..4501598c2f461021f0fa818e95fd1972ce2d3ace --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_sycl.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_SYCL_H +#define DNNL_SYCL_H + +#include "oneapi/dnnl/dnnl_sycl.h" + +#endif /* DNNL_SYCL_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_threadpool.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..e27e584a65ed16740d4fde93da3a1a049dd111aa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_threadpool.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_THREADPOOL_H +#define DNNL_THREADPOOL_H + +#include "oneapi/dnnl/dnnl_threadpool.h" + +#endif /* DNNL_THREADPOOL_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_types.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_types.h new file mode 100644 index 0000000000000000000000000000000000000000..6f4261b712dc37ec2416ba60c0c68bb30f6995e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_types.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_TYPES_H +#define DNNL_TYPES_H + +#include "oneapi/dnnl/dnnl_types.h" + +#endif /* DNNL_TYPES_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h b/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..32a3d5cf839b1d593f069520febfd60b323730e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_VERSION_H +#define DNNL_VERSION_H + +#include "oneapi/dnnl/dnnl_version.h" + +#endif /* DNNL_VERSION_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/experiments-config.h b/.venv/lib/python3.11/site-packages/torch/include/experiments-config.h new file mode 100644 index 0000000000000000000000000000000000000000..7c0cba4acdaef0784e7b96bfd6e755254d3eecb4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/experiments-config.h @@ -0,0 +1,25 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct xnn_experiment_config { + bool adaptive_avx_optimization; +}; + +struct xnn_experiment_config* xnn_get_experiment_config(); + +void xnn_experiment_enable_adaptive_avx_optimization(); + + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/.venv/lib/python3.11/site-packages/torch/include/fp16.h b/.venv/lib/python3.11/site-packages/torch/include/fp16.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7366e997dadef17922225bcbb489288f6f9cdc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/fp16.h @@ -0,0 +1,11 @@ +#pragma once +#ifndef FP16_H +#define FP16_H + +#include <fp16/fp16.h> + +#if defined(PSIMD_H) +#include <fp16/psimd.h> +#endif + +#endif /* FP16_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/fxdiv.h b/.venv/lib/python3.11/site-packages/torch/include/fxdiv.h new file mode 100644 index 0000000000000000000000000000000000000000..2c35038d97c55c524bb97caba2e3560cab9da504 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/fxdiv.h @@ -0,0 +1,425 @@ +#pragma once +#ifndef FXDIV_H +#define FXDIV_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include <cstddef> + #include <cstdint> + #include <climits> +#elif !defined(__OPENCL_VERSION__) + #include <stddef.h> + #include <stdint.h> + #include <limits.h> +#endif + +#if defined(_MSC_VER) + #include <intrin.h> + #if defined(_M_IX86) || defined(_M_X64) + #include <immintrin.h> + #endif +#endif + +#ifndef FXDIV_USE_INLINE_ASSEMBLY + #define FXDIV_USE_INLINE_ASSEMBLY 0 +#endif + +static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) { +#if defined(_MSC_VER) && defined(_M_IX86) + return (uint64_t) __emulu((unsigned int) a, (unsigned int) b); +#else + return (uint64_t) a * (uint64_t) b; +#endif +} + +static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) { +#if defined(__OPENCL_VERSION__) + return mul_hi(a, b); +#elif defined(__CUDA_ARCH__) + return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b); +#elif defined(_MSC_VER) && defined(_M_IX86) + return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32); +#elif defined(_MSC_VER) && defined(_M_ARM) + return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b); +#else + return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32); +#endif +} + +static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) { +#if defined(__OPENCL_VERSION__) +
return mul_hi(a, b); +#elif defined(__CUDA_ARCH__) + return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b); +#elif defined(_MSC_VER) && defined(_M_X64) + return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b); +#elif defined(__GNUC__) && defined(__SIZEOF_INT128__) + return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64); +#else + const uint32_t a_lo = (uint32_t) a; + const uint32_t a_hi = (uint32_t) (a >> 32); + const uint32_t b_lo = (uint32_t) b; + const uint32_t b_hi = (uint32_t) (b >> 32); + + const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) + + (uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo); + return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) + + ((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32); +#endif +} + +static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) { +#if SIZE_MAX == UINT32_MAX + return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b); +#elif SIZE_MAX == UINT64_MAX + return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b); +#else + #error Unsupported platform +#endif +} + +struct fxdiv_divisor_uint32_t { + uint32_t value; + uint32_t m; + uint8_t s1; + uint8_t s2; +}; + +struct fxdiv_result_uint32_t { + uint32_t quotient; + uint32_t remainder; +}; + +struct fxdiv_divisor_uint64_t { + uint64_t value; + uint64_t m; + uint8_t s1; + uint8_t s2; +}; + +struct fxdiv_result_uint64_t { + uint64_t quotient; + uint64_t remainder; +}; + +struct fxdiv_divisor_size_t { + size_t value; + size_t m; + uint8_t s1; + uint8_t s2; +}; + +struct fxdiv_result_size_t { + size_t quotient; + size_t remainder; +}; + +static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) { + struct fxdiv_divisor_uint32_t result = { d }; + if (d == 1) { + result.m = UINT32_C(1); + result.s1 = 0; + result.s2 = 0; + } else { + #if defined(__OPENCL_VERSION__) + const uint32_t l_minus_1 = 31 - clz(d - 1); + #elif defined(__CUDA_ARCH__) + const uint32_t l_minus_1 = 31 - __clz((int) (d - 1)); + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) + unsigned long l_minus_1; + _BitScanReverse(&l_minus_1, (unsigned long) (d - 1)); + #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY + uint32_t l_minus_1; + __asm__("BSRL %[d_minus_1], %[l_minus_1]" + : [l_minus_1] "=r" (l_minus_1) + : [d_minus_1] "r" (d - 1) + : "cc"); + #elif defined(__GNUC__) + const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1); + #else + /* Based on Algorithm 2 from Hacker's delight */ + + uint32_t l_minus_1 = 0; + uint32_t x = d - 1; + uint32_t y = x >> 16; + if (y != 0) { + l_minus_1 += 16; + x = y; + } + y = x >> 8; + if (y != 0) { + l_minus_1 += 8; + x = y; + } + y = x >> 4; + if (y != 0) { + l_minus_1 += 4; + x = y; + } + y = x >> 2; + if (y != 0) { + l_minus_1 += 2; + x = y; + } + if ((x & 2) != 0) { + l_minus_1 += 1; + } + #endif + uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d; + + /* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */ + #if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY + uint32_t q; + __asm__("DIVL %[d]" + : "=a" (q), "+d" (u_hi) + : [d] "r" (d), "a" (0) + : "cc"); + #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64)) + unsigned int remainder; + const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, 
&remainder); + #else + const uint32_t q = ((uint64_t) u_hi << 32) / d; + #endif + + result.m = q + UINT32_C(1); + result.s1 = 1; + result.s2 = (uint8_t) l_minus_1; + } + return result; +} + +static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) { + struct fxdiv_divisor_uint64_t result = { d }; + if (d == 1) { + result.m = UINT64_C(1); + result.s1 = 0; + result.s2 = 0; + } else { + #if defined(__OPENCL_VERSION__) + const uint32_t nlz_d = clz(d); + const uint32_t l_minus_1 = 63 - clz(d - 1); + #elif defined(__CUDA_ARCH__) + const uint32_t nlz_d = __clzll((long long) d); + const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1)); + #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) + unsigned long l_minus_1; + _BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1)); + unsigned long bsr_d; + _BitScanReverse64(&bsr_d, (unsigned __int64) d); + const uint32_t nlz_d = bsr_d ^ 0x3F; + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM)) + const uint64_t d_minus_1 = d - 1; + const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0; + unsigned long l_minus_1; + if ((uint32_t) (d_minus_1 >> 32) == 0) { + _BitScanReverse(&l_minus_1, (unsigned long) d_minus_1); + } else { + _BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32)); + l_minus_1 += 32; + } + const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2; + #elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY + uint64_t l_minus_1; + __asm__("BSRQ %[d_minus_1], %[l_minus_1]" + : [l_minus_1] "=r" (l_minus_1) + : [d_minus_1] "r" (d - 1) + : "cc"); + #elif defined(__GNUC__) + const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1); + const uint32_t nlz_d = __builtin_clzll(d); + #else + /* Based on Algorithm 2 from Hacker's delight */ + const uint64_t d_minus_1 = d - 1; + const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0; + uint32_t l_minus_1 = 0; + uint32_t x = (uint32_t) d_minus_1; + uint32_t y = d_minus_1 >> 32; + if (y != 0) { + l_minus_1 += 32; + x = y; + } + y = x >> 16; + if (y != 0) { + l_minus_1 += 16; + x = y; + } + y = x >> 8; + if (y != 0) { + l_minus_1 += 8; + x = y; + } + y = x >> 4; + if (y != 0) { + l_minus_1 += 4; + x = y; + } + y = x >> 2; + if (y != 0) { + l_minus_1 += 2; + x = y; + } + if ((x & 2) != 0) { + l_minus_1 += 1; + } + const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2; + #endif + uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d; + + /* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */ + #if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY + uint64_t q; + __asm__("DIVQ %[d]" + : "=a" (q), "+d" (u_hi) + : [d] "r" (d), "a" (UINT64_C(0)) + : "cc"); + #elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__) + /* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */ + const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d)); + #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64) + unsigned __int64 remainder; + const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder); + #else + /* Implementation based on code from Hacker's delight */ + + /* Normalize divisor and shift divident left */ + d <<= nlz_d; + u_hi <<= nlz_d; + /* Break divisor up into two 32-bit digits */ + const uint64_t d_hi = (uint32_t) (d >> 32); + 
const uint32_t d_lo = (uint32_t) d; + + /* Compute the first quotient digit, q1 */ + uint64_t q1 = u_hi / d_hi; + uint64_t r1 = u_hi - q1 * d_hi; + + while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) { + q1 -= 1; + r1 += d_hi; + if ((r1 >> 32) != 0) { + break; + } + } + + /* Multiply and subtract. */ + u_hi = (u_hi << 32) - q1 * d; + + /* Compute the second quotient digit, q0 */ + uint64_t q0 = u_hi / d_hi; + uint64_t r0 = u_hi - q0 * d_hi; + + while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) { + q0 -= 1; + r0 += d_hi; + if ((r0 >> 32) != 0) { + break; + } + } + const uint64_t q = (q1 << 32) | (uint32_t) q0; + #endif + result.m = q + UINT64_C(1); + result.s1 = 1; + result.s2 = (uint8_t) l_minus_1; + } + return result; +} + +static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) { +#if SIZE_MAX == UINT32_MAX + const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d); +#elif SIZE_MAX == UINT64_MAX + const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d); +#else + #error Unsupported platform +#endif + struct fxdiv_divisor_size_t size_result = { + (size_t) uint_result.value, + (size_t) uint_result.m, + uint_result.s1, + uint_result.s2 + }; + return size_result; +} + +static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { + const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m); + return (t + ((n - t) >> divisor.s1)) >> divisor.s2; +} + +static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { + const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m); + return (t + ((n - t) >> divisor.s1)) >> divisor.s2; +} + +static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { +#if SIZE_MAX == UINT32_MAX + const struct fxdiv_divisor_uint32_t uint32_divisor = { + (uint32_t) divisor.value, + (uint32_t) divisor.m, + divisor.s1, + divisor.s2 + }; + return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor); +#elif SIZE_MAX == UINT64_MAX + const struct fxdiv_divisor_uint64_t uint64_divisor = { + (uint64_t) divisor.value, + (uint64_t) divisor.m, + divisor.s1, + divisor.s2 + }; + return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor); +#else + #error Unsupported platform +#endif +} + +static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { + const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor); + return n - quotient * divisor.value; +} + +static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { + const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor); + return n - quotient * divisor.value; +} + +static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { + const size_t quotient = fxdiv_quotient_size_t(n, divisor); + return n - quotient * divisor.value; +} + +static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) { + const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity); + return quotient * granularity.value; +} + +static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) { + const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity); + return quotient * granularity.value; +} + +static inline size_t fxdiv_round_down_size_t(size_t n, const struct 
fxdiv_divisor_size_t granularity) { + const size_t quotient = fxdiv_quotient_size_t(n, granularity); + return quotient * granularity.value; +} + +static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { + const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor); + const uint32_t remainder = n - quotient * divisor.value; + struct fxdiv_result_uint32_t result = { quotient, remainder }; + return result; +} + +static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { + const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor); + const uint64_t remainder = n - quotient * divisor.value; + struct fxdiv_result_uint64_t result = { quotient, remainder }; + return result; +} + +static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { + const size_t quotient = fxdiv_quotient_size_t(n, divisor); + const size_t remainder = n - quotient * divisor.value; + struct fxdiv_result_size_t result = { quotient, remainder }; + return result; +} + +#endif /* FXDIV_H */ diff --git a/.venv/lib/python3.11/site-packages/torch/include/libshm.h b/.venv/lib/python3.11/site-packages/torch/include/libshm.h new file mode 100644 index 0000000000000000000000000000000000000000..28024aa2338d1f46ce280abeb92a633f89be1385 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/libshm.h @@ -0,0 +1,46 @@ +#pragma once + +#include <ATen/MapAllocator.h> + +#ifdef __cplusplus + +void libshm_init(const char* manager_exec_path); + +// Superclass to run a constructor before at::RefcountedMapAllocator +class THManagedMapAllocatorInit { + protected: + THManagedMapAllocatorInit(const char* manager_handle, const char* filename); + std::string manager_handle_; +}; + +// Like an at::RefcountedMapAllocator, but it also makes use of an external +// shared memory manager process to ensure that shared memory regions actually +// get freed in the end (even if processes lose the memory). +class THManagedMapAllocator : private THManagedMapAllocatorInit, + public at::RefcountedMapAllocator { + public: + THManagedMapAllocator( + const char* manager_handle, + const char* filename, + int flags, + size_t size); + + void close() override; + + ~THManagedMapAllocator() override { + close(); + } + + static at::DataPtr makeDataPtr( + const char* manager_handle, + const char* filename, + int flags, + size_t size); + static THManagedMapAllocator* fromDataPtr(const at::DataPtr&); + + const char* manager_handle() const { + return manager_handle_.c_str(); + } +}; + +#endif diff --git a/.venv/lib/python3.11/site-packages/torch/include/nnpack.h b/.venv/lib/python3.11/site-packages/torch/include/nnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..97b5ff390076e9ab7ae91e67bfc0d78736aaeffd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/nnpack.h @@ -0,0 +1,659 @@ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <stdbool.h> + +#include <pthreadpool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Status code for any NNPACK function call. + */ +enum nnp_status { + /** The call succeeded, and all output arguments now contain valid data. */ + nnp_status_success = 0, + /** NNPACK function was called with batch_size == 0. */ + nnp_status_invalid_batch_size = 2, + /** NNPACK function was called with channels == 0. */ + nnp_status_invalid_channels = 3, + /** NNPACK function was called with input_channels == 0.
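The fxdiv.h header completed above is easiest to read from the call side: fxdiv_init_* precomputes a magic multiplier m and shift amounts s1/s2 for one divisor, after which every quotient costs a high multiply plus a couple of shifts and adds instead of a hardware divide. A usage sketch (assuming the header is on the include path; the divisor and loop bound are arbitrary):

#include <assert.h>
#include <stdint.h>
#include "fxdiv.h"

int main(void) {
    /* Precompute the magic constants for dividing by 7 once... */
    const struct fxdiv_divisor_uint32_t by7 = fxdiv_init_uint32_t(7);
    /* ...then quotient and remainder agree with the hardware operators. */
    for (uint32_t n = 0; n < 10000; n++) {
        assert(fxdiv_quotient_uint32_t(n, by7) == n / 7);
        assert(fxdiv_remainder_uint32_t(n, by7) == n % 7);
    }
    return 0;
}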
*/ + nnp_status_invalid_input_channels = 4, + /** NNPACK function was called with output_channels == 0. */ + nnp_status_invalid_output_channels = 5, + /** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */ + nnp_status_invalid_input_size = 10, + /** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */ + nnp_status_invalid_input_stride = 11, + /** NNPACK function was called with input_padding not less than respective kernel (or pooling) size, i.e.: + * + * - input_padding.left >= kernel_size.width (>= pooling_size.width) + * - input_padding.right >= kernel_size.width (>= pooling_size.width) + * - input_padding.top >= kernel_size.height (>= pooling_size.height) + * - input_padding.bottom >= kernel_size.height (>= pooling_size.height) + */ + nnp_status_invalid_input_padding = 12, + /** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */ + nnp_status_invalid_kernel_size = 13, + /** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */ + nnp_status_invalid_pooling_size = 14, + /** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */ + nnp_status_invalid_pooling_stride = 15, + /** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */ + nnp_status_invalid_algorithm = 16, + /** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */ + nnp_status_invalid_transform_strategy = 17, + /** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0 */ + nnp_status_invalid_output_subsampling = 13, + /** NNPACK function was called with activation not in nnp_activation enum */ + nnp_status_invalid_activation = 14, + /** NNPACK function was called with invalid activation parameters */ + nnp_status_invalid_activation_parameters = 15, + + /** NNPACK does not support the particular input size for the function */ + nnp_status_unsupported_input_size = 20, + /** NNPACK does not support the particular input stride for the function */ + nnp_status_unsupported_input_stride = 21, + /** NNPACK does not support the particular input padding for the function */ + nnp_status_unsupported_input_padding = 22, + /** NNPACK does not support the particular kernel size for the function */ + nnp_status_unsupported_kernel_size = 23, + /** NNPACK does not support the particular pooling size for the function */ + nnp_status_unsupported_pooling_size = 24, + /** NNPACK does not support the particular pooling stride for the function */ + nnp_status_unsupported_pooling_stride = 25, + /** NNPACK does not support the particular convolution algorithm for the function */ + nnp_status_unsupported_algorithm = 26, + /** NNPACK does not support the particular convolution transform strategy for the algorithm */ + nnp_status_unsupported_transform_strategy = 27, + /** NNPACK does not support the particular activation function for the function */ + nnp_status_unsupported_activation = 28, + /** NNPACK does not support the particular activation function parameters for the function */ + nnp_status_unsupported_activation_parameters = 29, + + /** NNPACK function was called before the library was initialized */ + nnp_status_uninitialized = 50, + /** NNPACK does not implement this function for the host CPU */ + nnp_status_unsupported_hardware = 51, + /** NNPACK failed to allocate memory for temporary buffers */ + nnp_status_out_of_memory = 52, + /** 
Scratch space buffer is too small */ + nnp_status_insufficient_buffer = 53, + /** Scratch space buffer is not properly aligned */ + nnp_status_misaligned_buffer = 54 +}; + +/** + * @brief Activation applied after a convolutional or fully-connected layer. + */ +enum nnp_activation { + /** Identity activation f(x) := x, i.e. no transformation */ + nnp_activation_identity = 0, + /** ReLU activation f(x) := max(0, x) */ + nnp_activation_relu = 1, +}; + +/** + * @brief Algorithm for computing convolutional layers. + */ +enum nnp_convolution_algorithm { + /** Let NNPACK choose the algorithm depending on layer parameters */ + nnp_convolution_algorithm_auto = 0, + /** Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */ + nnp_convolution_algorithm_ft8x8 = 1, + /** Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */ + nnp_convolution_algorithm_ft16x16 = 2, + /** Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Supports only 3x3 kernels. */ + nnp_convolution_algorithm_wt8x8 = 3, + /** Direct convolution via implicit GEMM. */ + nnp_convolution_algorithm_implicit_gemm = 4, + /** Direct convolution implementation. */ + nnp_convolution_algorithm_direct = 5, + /** + * Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks in FP16. + * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP), + * on non-supported processors falls back to nnp_convolution_algorithm_wt8x8. + */ + nnp_convolution_algorithm_wt8x8_fp16 = 6, +}; + +enum nnp_convolution_transform_strategy { + nnp_convolution_transform_strategy_compute = 1, + nnp_convolution_transform_strategy_precompute = 2, + nnp_convolution_transform_strategy_reuse = 3 +}; + +/* For backward compatibility */ +#define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute +#define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute + +/** + * @brief Size of images, kernels, and pooling filters in NNPACK. + */ +struct nnp_size { + /** Width (horizontal size) of an image, kernel, or pooling filter. */ + size_t width; + /** Height (vertical size) of an image, kernel, or pooling filter. */ + size_t height; +}; + +/** + * @brief Padding of images in NNPACK. + */ +struct nnp_padding { + /** Padding above the image data */ + size_t top; + /** Padding on the right of image data */ + size_t right; + /** Padding below the image data */ + size_t bottom; + /** Padding on the left of image data */ + size_t left; +}; + +/** + * @brief Profiling information about time spent in different phases of a function call. + */ +struct nnp_profile { + /** Time spent inside the function call, in seconds. */ + double total; + /** Time spent on transformation of the input or input gradient tensor, in seconds. */ + double input_transform; + /** Time spent on transformation of the kernel or kernel gradient tensor, in seconds. */ + double kernel_transform; + /** Time spent on transformation of the output or output gradient tensor, in seconds. */ + double output_transform; + /** Time spent on multiplication-accumulation of transformed coefficients, in seconds. */ + double block_multiplication; +}; + +enum nnp_status nnp_initialize(void); + +enum nnp_status nnp_deinitialize(void); + +/** + * @brief Computes output of a 2D convolutional layer from input and kernel tensors.
+ * @details This function targets training of convolutional neural networks and performs forward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * For minibatch size 1, use nnp_convolution_inference for optimal performance. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. + * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). + * Supports only 3x3 kernels. + * + * @param batch_size The number of images on the input and output of the convolutional layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input images. + * @param output_channels The number of channels (AKA features, dimensions) in the output images. + * @param input_size Size of input images, excluding implicit zero-padding. + * @param input_padding Implicit zero-padding of input images. + * @param kernel_size Kernel size. + * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width]. + * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param[in] bias A 1D array bias[output_channels]. + * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ + +enum nnp_status nnp_convolution_output( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float* input, + const float* kernel, + const float* bias, + float* output, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors. + * @details This function targets training of convolutional neural networks and performs backward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. 
+ * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). + * Supports only 3x3 kernels. + * + * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients). + * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients). + * @param input_size Size of input images and their gradients, excluding implicit zero-padding. + * @param input_padding Implicit zero-padding of input images. + * @param kernel_size Kernel size. + * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width] + * where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param[out] grad_input A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ +enum nnp_status nnp_convolution_input_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float* grad_output, + const float* kernel, + float* grad_input, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors. + * @details This function targets training of convolutional neural networks and performs backward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. + * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * + * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input images. + * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients). + * @param input_size Size of input images and their gradients, excluding implicit zero-padding. 
+ * @param input_padding Implicit zero-padding of input images. + * @param kernel_size Kernel size. + * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width]. + * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width] + * where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param[out] grad_kernel A 4D tensor + * grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ +enum nnp_status nnp_convolution_kernel_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float* input, + const float* grad_output, + float* grad_kernel, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor. + * @details This function targets prediction with convolutional neural networks and performs forward propagation. + * @param algorithm The type of algorithm to use for convolution. Possible values are: + * + * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. + * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. + * Supports kernels up to 8x8. + * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. + * Supports kernels up to 16x16. + * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). + * Supports only 3x3 kernels. + * + * @param transform_strategy A strategy that guides computation of kernel transforms coefficients. + * Possible values are: + * + * - nnp_convolution_transform_strategy_block_based -- do multiplication-accumulations on blocks of transformed + * coefficients. + * - nnp_convolution_transform_strategy_tuple_based -- do multiplication-accumulations on tuples of transformed + * coefficients. + * + * @param input_channels The number of channels (AKA features, dimensions) in the input image. + * @param output_channels The number of channels (AKA features, dimensions) in the output image. + * @param input_size Size of input image, excluding implicit zero-padding. + * @param input_padding Implicit zero-padding of input image. + * @param kernel_size Kernel size. + * @param output_subsampling Subsample region for output, also known as convolution stride. + * @param[in] input A 3D tensor input[input_channels][input_size.height][input_size.width]. + * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. + * @param[in] bias A 1D array bias[output_channels]. 
+ * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where + * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - + * (kernel_size.height - 1) + * output_size.width = (input_padding.left + input_size.width + input_padding.right) - + * (kernel_size.width - 1) + * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes. + * If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size + * of required workspace memory at the workspace_size location, and exit without + * computations. + * If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory + * before and deallocate after this computation, potentially at significant runtime cost. + * @param[in,out] workspace_size Pointer to the size of workspace buffer. + * If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to + * the location specified by this pointer. + * If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of + * the buffer, in bytes. + * If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK + * would allocate memory before and deallocate after this computation, potentially at + * significant runtime cost. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + * @param[out] profile An optional pointer to profiling structure. + * If provided, the structure would record time spent in different phases of the computation. + */ +enum nnp_status nnp_convolution_inference( + enum nnp_convolution_algorithm algorithm, + enum nnp_convolution_transform_strategy transform_strategy, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + struct nnp_size output_subsampling, + const float* input, + const float* kernel, + const float* bias, + float* output, + void* workspace_buffer, + size_t* workspace_size, + enum nnp_activation activation, + const void* activation_parameters, + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes output of a fully connected layer from input and kernel matrices. + * @details This function targets training of convolutional neural networks and performs forward propagation. + * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. + * For minibatch size 1, use nnp_fully_connected_inference for optimal performance. + * @param batch_size The number of vectors on the input and output of the fully connected layer. + * @param input_channels The number of channels (AKA features, dimensions) in the input matrix. + * @param output_channels The number of channels (AKA features, dimensions) in the output matrix. + * @param[in] input A 2D matrix input[batch_size][input_channels]. + * @param[in] kernel A 2D matrix kernel[output_channels][input_channels]. + * @param[out] output A 2D matrix output[batch_size][output_channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. 
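The workspace contract spelled out above is a two-call protocol: call once with workspace_buffer == NULL so NNPACK reports the required scratch size, then call again with an aligned buffer of that size. A sketch under assumed shapes (one input channel, eight output channels, 32x32 input, 3x3 kernel; the wrapper name is illustrative, and nnp_initialize() is presumed to have succeeded earlier):

#include <stdlib.h>
#include <nnpack.h>

enum nnp_status conv_with_workspace(const float* input, const float* kernel,
                                    const float* bias, float* output) {
    const struct nnp_size input_size = { 32, 32 };
    const struct nnp_padding padding = { 1, 1, 1, 1 };
    const struct nnp_size kernel_size = { 3, 3 };
    const struct nnp_size subsampling = { 1, 1 };

    /* First call: NULL buffer, so only the required size is written. */
    size_t workspace_size = 0;
    enum nnp_status status = nnp_convolution_inference(
        nnp_convolution_algorithm_auto,
        nnp_convolution_transform_strategy_compute,
        1, 8, input_size, padding, kernel_size, subsampling,
        input, kernel, bias, output, NULL, &workspace_size,
        nnp_activation_identity, NULL, NULL /* threadpool */, NULL /* profile */);
    if (status != nnp_status_success)
        return status;

    /* Second call: hand over a 64-byte-aligned buffer; the convolution runs.
     * aligned_alloc (C11) needs the size rounded up to the alignment. */
    void* workspace = aligned_alloc(64, (workspace_size + 63) & ~(size_t)63);
    if (workspace == NULL)
        return nnp_status_out_of_memory;
    status = nnp_convolution_inference(
        nnp_convolution_algorithm_auto,
        nnp_convolution_transform_strategy_compute,
        1, 8, input_size, padding, kernel_size, subsampling,
        input, kernel, bias, output, workspace, &workspace_size,
        nnp_activation_identity, NULL, NULL, NULL);
    free(workspace);
    return status;
}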
+ */ +enum nnp_status nnp_fully_connected_output( + size_t batch_size, + size_t input_channels, + size_t output_channels, + const float input[], + const float kernel[], + float output[], + pthreadpool_t threadpool, + struct nnp_profile* profile); + +/** + * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix. + * @details This function targets prediction with convolutional neural networks and performs forward propagation. + * @param input_channels The number of channels (AKA features, dimensions) in the input vector. + * @param output_channels The number of channels (AKA features, dimensions) in the output vector. + * @param[in] input A 1D array input[input_channels] of FP32 elements. + * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements. + * @param[out] output A 1D array output[output_channels] of FP32 elements. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_fully_connected_inference( + size_t input_channels, + size_t output_channels, + const float* input, + const float* kernel, + float* output, + pthreadpool_t threadpool); + +/** + * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix. + * @details This function targets prediction with convolutional neural networks and performs forward propagation. + * @param input_channels The number of channels (AKA features, dimensions) in the input vector. + * @param output_channels The number of channels (AKA features, dimensions) in the output vector. + * @param[in] input A 1D array input[input_channels] of FP32 elements. + * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements. + * @param[out] output A 1D array output[output_channels] of FP32 elements. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_fully_connected_inference_f16f32( + size_t input_channels, + size_t output_channels, + const float* input, + const void* kernel, + float* output, + pthreadpool_t threadpool); + +/** + * @brief Computes output of a max-pooling layer for an input tensor. + * @details This function targets both prediction and training of convolutional neural networks and performs forward + * propagation. It is optimized for both large and small minibatch sizes. + * @param batch_size The number of images on the input and output of the max-pooling layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output images. + * @param input_size Size of input images, excluding implicit zero-padding. + * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but + * affect the output size. + * @param pooling_size Size of the pooling filter. Only 2x2 filters are currently supported. + * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported. + * @param[in] input A 4D tensor input[batch_size][channels][input_size.height][input_size.width]. + * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where + * output_size.height = ceil( + * (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) / + * pooling_stride.height) + 1 + * output_size.width = ceil( + * (input_padding.left + input_size.width + input_padding.right - pooling_size.width) / + * pooling_stride.width) + 1 + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_max_pooling_output( + size_t batch_size, + size_t channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size pooling_size, + struct nnp_size pooling_stride, + const float input[], + float output[], + pthreadpool_t threadpool); + +/** + * @brief Computes output of a softmax layer for an input matrix. + * @details This function targets both prediction and training of convolutional neural networks and performs forward + * propagation. It is optimized for both large and small minibatch sizes. + * @param batch_size The number of vectors on the input and output of the softmax layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output vectors. + * @param[in] input A 2D matrix input[batch_size][channels]. + * @param[out] output A 2D matrix output[batch_size][channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_softmax_output( + size_t batch_size, + size_t channels, + const float input[], + float output[], + pthreadpool_t threadpool); + +/** + * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix. + * @details This function targets both prediction and training of convolutional neural networks and performs forward + * propagation. It is optimized for both large and small minibatch sizes. + * @param batch_size The number of vectors on the input and output of the ReLU layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output matrices. + * @param[in] input A 2D matrix input[batch_size][channels]. + * @param[out] output A 2D matrix output[batch_size][channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization. + */ +enum nnp_status nnp_relu_output( + size_t batch_size, + size_t channels, + const float input[], + float output[], + float negative_slope, + pthreadpool_t threadpool); + +/** + * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices. + * @details This function targets training of convolutional neural networks and performs backward propagation. + * It is optimized for both large and small minibatch sizes. + * @param batch_size The number of vectors on the input and output of the ReLU layer. + * @param channels The number of channels (AKA features, dimensions) in both input and output matrices. + * @param[in] grad_output A 2D matrix grad_output[batch_size][channels]. + * @param[in] input A 2D matrix input[batch_size][channels]. + * @param[out] grad_input A 2D matrix grad_input[batch_size][channels]. + * @param threadpool A thread pool for parallelization of the computation. + * If threadpool is NULL, the computation would run on the caller thread without parallelization.
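The output-size expressions quoted in the convolution and pooling doc comments reduce to two one-line helpers; a sketch with illustrative names:

#include <stddef.h>

/* Stride-1 convolution output extent: padded input minus (kernel - 1). */
static size_t conv_output_dim(size_t pad_lo, size_t in, size_t pad_hi,
                              size_t kernel) {
    return (pad_lo + in + pad_hi) - (kernel - 1);
}

/* Pooling output extent: ceil((padded input - pooling size) / stride) + 1. */
static size_t pool_output_dim(size_t pad_lo, size_t in, size_t pad_hi,
                              size_t pool, size_t stride) {
    const size_t span = pad_lo + in + pad_hi - pool;
    return (span + stride - 1) / stride + 1; /* integer ceiling division */
}

/* Checks: a 3x3 kernel with 1-pixel padding keeps 224 at (1+224+1)-(3-1) = 224;
 * 2x2 pooling at stride 2 with no padding halves it: ceil((224-2)/2)+1 = 112. */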
+ */ +enum nnp_status nnp_relu_input_gradient( + size_t batch_size, + size_t channels, + const float grad_output[], + const float input[], + float grad_input[], + float negative_slope, + pthreadpool_t threadpool); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#ifdef __cplusplus +// Backward compatible implementations for nnp_convolution_*, if we are in C++ +// mode. +inline enum nnp_status nnp_convolution_output( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float input[], + const float kernel[], + const float bias[], + float output[], + pthreadpool_t threadpool, + struct nnp_profile* profile) +{ + return nnp_convolution_output( + algorithm, + batch_size, input_channels, output_channels, + input_size, input_padding, kernel_size, + input, kernel, bias, output, + NULL, NULL, + nnp_activation_identity, NULL, threadpool, profile); +} + +inline enum nnp_status nnp_convolution_input_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float grad_output[], + const float kernel[], + float grad_input[], + pthreadpool_t threadpool, + struct nnp_profile* profile) +{ + return nnp_convolution_input_gradient( + algorithm, + batch_size, input_channels, output_channels, + input_size, input_padding, kernel_size, + grad_output, kernel, grad_input, + NULL, NULL, + nnp_activation_identity, NULL, threadpool, profile); +} + +inline enum nnp_status nnp_convolution_kernel_gradient( + enum nnp_convolution_algorithm algorithm, + size_t batch_size, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + const float input[], + const float grad_output[], + float grad_kernel[], + pthreadpool_t threadpool, + struct nnp_profile* profile) +{ + return nnp_convolution_kernel_gradient( + algorithm, + batch_size, input_channels, output_channels, + input_size, input_padding, kernel_size, + input, grad_output, grad_kernel, + NULL, NULL, + nnp_activation_identity, NULL, threadpool, profile); +} + +inline enum nnp_status nnp_convolution_inference( + enum nnp_convolution_algorithm algorithm, + enum nnp_convolution_transform_strategy transform_strategy, + size_t input_channels, + size_t output_channels, + struct nnp_size input_size, + struct nnp_padding input_padding, + struct nnp_size kernel_size, + struct nnp_size output_subsampling, + const float input[], + const float kernel[], + const float bias[], + float output[], + pthreadpool_t threadpool, + struct nnp_profile* profile) { + return nnp_convolution_inference( + algorithm, transform_strategy, + input_channels, output_channels, + input_size, input_padding, kernel_size, output_subsampling, + input, kernel, bias, output, NULL, NULL, + nnp_activation_identity, NULL, + threadpool, profile); +} + +#endif // __cplusplus diff --git a/.venv/lib/python3.11/site-packages/torch/include/qnnpack_func.h b/.venv/lib/python3.11/site-packages/torch/include/qnnpack_func.h new file mode 100644 index 0000000000000000000000000000000000000000..10bbc000192d7e03745e2cf3fb263a9655cde00c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/qnnpack_func.h @@ -0,0 +1,166 @@ +#pragma once + +#include +#include + +namespace qnnpack { +class 
PrePackConvWeights final { + public: + PrePackConvWeights( + const pytorch_qnnp_operator_t convolution, + const uint8_t* kernel_zero_points, + const uint8_t* kernel, + const int32_t* bias); + + void* getPackedWeights() const + { + return packed_weights_; + } + + int64_t getOutputChannels() const + { + return output_channels_; + } + + ~PrePackConvWeights() + { + if (packed_weights_ != nullptr) { + free(packed_weights_); + } + } + + PrePackConvWeights() = delete; + PrePackConvWeights(const PrePackConvWeights&) = delete; + PrePackConvWeights& operator=(const PrePackConvWeights&) = delete; + + private: + void* packed_weights_ = nullptr; + int64_t output_channels_; +}; + +class PackBMatrix final { + public: + PackBMatrix( + size_t input_channels, + size_t output_channels, + const uint8_t* kernel_zero_points, + const float* requantization_scale, + const uint8_t* kernel, + const int32_t* bias); + + // This constructor is to be used for dynamic mode + // quantization. In dynamic mode, we dont yet support + // per channel quantization, and paying the cost of + // memory allocation for per channel zero point and + // requant scale will hurt performance. + PackBMatrix( + size_t input_channels, + size_t output_channels, + const uint8_t kernel_zero_point, + const float requantization_scale, + const uint8_t* kernel, + const int32_t* bias); + + void* getPackedWeights() const + { + return packed_weights_; + } + + void unpackWeights( + const uint8_t* kernel_zero_points, + int8_t* kernel + ) const; + + size_t getInputChannels() const + { + return input_channels_; + } + + size_t getOutputChannels() const + { + return output_channels_; + } + + ~PackBMatrix() + { + if (packed_weights_ != nullptr) { + free(packed_weights_); + } + } + + PackBMatrix() = delete; + PackBMatrix(const PackBMatrix&) = delete; + PackBMatrix& operator=(const PackBMatrix&) = delete; + + private: + void* packed_weights_ = nullptr; + size_t input_channels_; + size_t output_channels_; +}; + +enum pytorch_qnnp_status qnnpackLinear( + const size_t batch_size, + const size_t input_channels, + const size_t output_channels, + const uint8_t input_zero_point, + const uint8_t* kernel_zero_points, + const float* requantization_scales, + const uint8_t output_zero_point, + const uint8_t output_min, + const uint8_t output_max, + const uint8_t* input, + const size_t input_stride, + void* packed_weights, + uint8_t* output, + const size_t output_stride, + pthreadpool_t threadpool); + +enum pytorch_qnnp_status qnnpackConv( + const pytorch_qnnp_operator_t convolution, + void* packed_weights, + const size_t batch_size, + const size_t input_depth, + const size_t input_height, + const size_t input_width, + const uint8_t input_zero_point, + const uint8_t* input, + const uint8_t* kernel_zero_points, + const float* requantization_scales, + const uint8_t output_zero_point, + const uint8_t output_min, + const uint8_t output_max, + uint8_t* output, + pthreadpool_t threadpool); + +enum pytorch_qnnp_status qnnpackDeConv( + const pytorch_qnnp_operator_t deconvolution, + void* packed_weights, + const size_t batch_size, + const size_t input_height, + const size_t input_width, + const uint8_t input_zero_point, + const uint8_t* input, + const uint8_t* kernel_zero_points, + const float* requantization_scales, + const uint8_t output_zero_point, + const uint8_t output_min, + const uint8_t output_max, + uint8_t* output, + pthreadpool_t threadpool); + +enum pytorch_qnnp_status qnnpackLinearDynamic( + const size_t batch_size, + const size_t input_channels, + const size_t 
output_channels, + const uint8_t input_zero_point, + const uint8_t* kernel_zero_points, + const float* dequantization_scales, + const uint8_t* input, + const size_t input_stride, + void* packed_weights, + const float* bias, + float* output, + const size_t output_stride, + pthreadpool_t threadpool); + +} // namespace qnnpack diff --git a/.venv/lib/python3.11/site-packages/torch/include/sleef.h b/.venv/lib/python3.11/site-packages/torch/include/sleef.h new file mode 100644 index 0000000000000000000000000000000000000000..292ac5b8be30c5766679ce2dd562014fdf50d4f2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/sleef.h @@ -0,0 +1,4170 @@ +// Copyright Naoki Shibata and contributors 2010 - 2023. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef __SLEEF_H__ +#define __SLEEF_H__ + +#define SLEEF_VERSION_MAJOR 3 +#define SLEEF_VERSION_MINOR 6 +#define SLEEF_VERSION_PATCHLEVEL 0 + +#include +#include + +#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) +#define SLEEF_CONST __attribute__((const)) +#define SLEEF_INLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define SLEEF_CONST +#define SLEEF_INLINE __forceinline +#endif + +#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) +#ifndef FP_FAST_FMA +#define FP_FAST_FMA +#endif +#ifndef FP_FAST_FMAF +#define FP_FAST_FMAF +#endif +#endif + +#if defined(_MSC_VER) && !defined(__STDC__) +#define __STDC__ 1 +#endif + +#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#ifdef SLEEF_IMPORT_IS_EXPORT +#define SLEEF_IMPORT __declspec(dllexport) +#else // #ifdef SLEEF_IMPORT_IS_EXPORT +#define SLEEF_IMPORT __declspec(dllimport) +#if (defined(_MSC_VER)) +#pragma comment(lib,"sleef.lib") +#endif // #if (defined(_MSC_VER)) +#endif // #ifdef SLEEF_IMPORT_IS_EXPORT +#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#define SLEEF_IMPORT +#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) + +#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif + +#if (defined(_MSC_VER)) +#include +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#endif + +#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#include +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +#if defined(__VX__) && defined(__VEC__) +#ifndef SLEEF_VECINTRIN_H_INCLUDED +#include +#define SLEEF_VECINTRIN_H_INCLUDED +#endif +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +// + +#if defined(SLEEF_ENABLE_OMP_SIMD) && (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#if 
defined(__aarch64__) +//#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(2) notinbranch") +//#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(4) notinbranch") +//#elif defined(__x86_64__) && defined(__AVX512F__) +//#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(8) notinbranch") +//#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(16) notinbranch") +#elif defined(__x86_64__) && defined(__AVX__) +#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(4) notinbranch") +#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(8) notinbranch") +#elif defined(__x86_64__) && defined(__SSE2__) +#define SLEEF_PRAGMA_OMP_SIMD_DP _Pragma ("omp declare simd simdlen(2) notinbranch") +#define SLEEF_PRAGMA_OMP_SIMD_SP _Pragma ("omp declare simd simdlen(4) notinbranch") +#endif +#endif + +#ifndef SLEEF_PRAGMA_OMP_SIMD_DP +#define SLEEF_PRAGMA_OMP_SIMD_DP +#define SLEEF_PRAGMA_OMP_SIMD_SP +#endif + +// + +#ifndef SLEEF_FP_ILOGB0 +#define SLEEF_FP_ILOGB0 ((int)0x80000000) +#endif + +#ifndef SLEEF_FP_ILOGBNAN +#define SLEEF_FP_ILOGBNAN ((int)2147483647) +#endif + +// + +SLEEF_IMPORT void *Sleef_malloc(size_t z); +SLEEF_IMPORT void Sleef_free(void *ptr); +SLEEF_IMPORT uint64_t Sleef_currentTimeMicros(); + +#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +SLEEF_IMPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx); +#endif + +// + +#if defined(__riscv_v) +#include <riscv_vector.h> +typedef vfloat64m2_t Sleef_vfloat64m1_t_2; +typedef vfloat32m2_t Sleef_vfloat32m1_t_2; +typedef vfloat64m4_t Sleef_vfloat64m2_t_2; +typedef vfloat32m4_t Sleef_vfloat32m2_t_2; +#define Sleef_vfloat64m1_t_2_DEFINED +#define Sleef_vfloat32m1_t_2_DEFINED +#define Sleef_vfloat64m2_t_2_DEFINED +#define Sleef_vfloat32m2_t_2_DEFINED +#endif + +#ifndef Sleef_double2_DEFINED +#define Sleef_double2_DEFINED +typedef struct { + double x, y; +} Sleef_double2; +#endif + +#ifndef Sleef_float2_DEFINED +#define Sleef_float2_DEFINED +typedef struct { + float x, y; +} Sleef_float2; +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sin_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cos_u35(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincos_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tan_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asin_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acos_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan2_u35(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrt_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sin_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cos_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincos_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tan_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asin_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acos_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double
Sleef_atan2_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrt_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_pow_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tanh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinh_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosh_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tanh_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asinh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acosh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atanh_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_expm1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log10_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log1p_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincospi_u05(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincospi_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinpi_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cospi_u05(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexp(double, int); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST int Sleef_ilogb(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fma(double, double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrt(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrt_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrt_u35(double); + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypot_u05(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypot_u35(double, double); + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fabs(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_copysign(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmax(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmin(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fdim(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_trunc(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_floor(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_ceil(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_round(double); 
+SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_rint(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_nextafter(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexp(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST int Sleef_expfrexp(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmod(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_remainder(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_modf(double); + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_lgamma_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tgamma_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erf_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erfc_u15(double); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf_u35(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincosf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f_u35(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincosf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_powf_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf_u3500(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float 
Sleef_acoshf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log10f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincospif_u05(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_sincospif_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif_u05(float d); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cospif_u05(float d); +SLEEF_IMPORT SLEEF_CONST float Sleef_ldexpf(float, int); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST int Sleef_ilogbf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf(float, float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf_u35(float); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf_u05(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf_u35(float, float); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fminf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_truncf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_floorf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_roundf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_rintf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST int Sleef_expfrexpf(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float2 Sleef_modff(float); + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erff_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf_u15(float); +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + 
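The scalar declarations above make SLEEF's naming convention visible: the trailing _uNN suffix encodes the maximum error bound in units of 0.1 ULP (so _u10 is a 1.0 ULP bound, _u35 is 3.5 ULP, and _u05 is roughly 0.5 ULP), and the sincos variants return both results at once through the two-element Sleef_double2/Sleef_float2 structs defined earlier. The SIMD blocks that follow repeat the same surface per vector type, with an element-count infix in the name (sind2 operates on two doubles in an __m128d, sinf4 on four floats in an __m128). A minimal scalar sketch, assuming libsleef is installed and the program is linked with -lsleef:

#include <cstdio>
#include <sleef.h>

int main() {
  double x = 1.0;
  // Same function, two accuracy tiers: the error bound is part of the name.
  std::printf("sin u10: %.17g\n", Sleef_sin_u10(x));  // <= 1.0 ULP
  std::printf("sin u35: %.17g\n", Sleef_sin_u35(x));  // <= 3.5 ULP

  // sincos computes both values in one call: sine in .x, cosine in .y.
  Sleef_double2 sc = Sleef_sincos_u10(x);
  std::printf("sincos: %.17g %.17g\n", sc.x, sc.y);

  // Single-precision variants carry an f before the accuracy suffix.
  std::printf("powf u10: %g\n", Sleef_powf_u10(2.0f, 10.0f));
  return 0;
}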
+SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2(__m128d); +SLEEF_IMPORT 
SLEEF_CONST __m128d Sleef_copysignd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_truncd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10(__m128); 
+SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35sse2(__m128d, 
__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u35sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cbrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cbrtd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_powd2_u10sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastsind2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastcosd2_u3500sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastpowd2_u3500sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asinhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_cinz_acoshd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atanhd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expm1d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log10d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log1pd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinpid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cospid2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2_sse2(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ldexpd2_sse2(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_ilogbd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2_sse2(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmad2_sse2(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u05sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u35sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u05sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u35sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fabsd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_copysignd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_copysignd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_cinz_fmaxd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmind2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fdimd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_truncd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_truncd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_floord2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ceild2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_roundd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_rintd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_nextafterd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_frfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_expfrexpd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmodd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_remainderd2_sse2(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_lgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tgammad2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfd2_u10sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfcd2_u15sse2(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2_sse2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2_sse2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 
Sleef_cinz_atanf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_powf4_u10sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastsinf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastcosf4_u3500sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastpowf4_u3500sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acoshf4_u10sse2(__m128); 
+SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanhf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expm1f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log10f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log1pf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinpif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cospif4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4_sse2(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaf4_sse2(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u05sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u35sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u05sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u35sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fabsf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_copysignf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaxf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fminf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fdimf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 
Sleef_cinz_truncf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_floorf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_ceilf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_roundf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_rintf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_nextafterf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_frfrexpf4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmodf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_remainderf4_sse2(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_modff4_sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_lgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tgammaf4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erff4_u10sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15sse2(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erfcf4_u15sse2(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4_sse2(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf4_sse2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4_sse2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf4_sse2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_cinz_cbrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asind2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acosd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atand2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atan2d2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_logd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cbrtd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_powd2_u10sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_coshd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tanhd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastsind2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastcosd2_u3500sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fastpowd2_u3500sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_asinhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_acoshd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_atanhd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_exp2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_exp10d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_expm1d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log10d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log2d2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_log1pd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sinpid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_cospid2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2_sse4(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ldexpd2_sse4(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_ilogbd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2_sse4(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmad2_sse4(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u05sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_sqrtd2_u35sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u05sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_hypotd2_u35sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fabsd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_copysignd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_copysignd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmaxd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmind2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fdimd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST 
__m128d Sleef_truncd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_truncd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_floord2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_ceild2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_roundd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_rintd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_nextafterd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_frfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_expfrexpd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_fmodd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_remainderd2_sse4(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_lgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_tgammad2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfd2_u10sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cinz_erfcd2_u15sse4(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2_sse4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2_sse4(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 
Sleef_cbrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acosf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atan2f4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_logf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cbrtf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_powf4_u10sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_coshf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tanhf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastsinf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastcosf4_u3500sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fastpowf4_u3500sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_asinhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_acoshf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_atanhf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35sse4(__m128); +SLEEF_IMPORT 
SLEEF_CONST __m128 Sleef_cinz_exp2f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_exp10f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_expm1f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log10f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log2f4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_log1pf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sinpif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_cospif4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4_sse4(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaf4_sse4(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u05sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_sqrtf4_u35sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u05sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_hypotf4_u35sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fabsf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_copysignf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmaxf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fminf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fdimf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_truncf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_floorf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_ceilf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4_sse4(__m128); +SLEEF_IMPORT 
SLEEF_CONST __m128 Sleef_cinz_roundf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_rintf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_nextafterf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_frfrexpf4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_fmodf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_remainderf4_sse4(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_cinz_modff4_sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_lgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_tgammaf4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erff4_u10sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15sse4(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cinz_erfcf4_u15sse4(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4_sse4(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf4_sse4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4_sse4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf4_sse4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_tanhd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_lgammad4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15(__m256d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35(__m256); 
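/* [Editorial note: illustrative sketch, not part of the vendored sleef.h diff.]
 * A minimal usage example for the unsuffixed AVX entry points declared in this
 * section. Assumptions: libsleef is installed and linked (e.g. `cc -mavx demo.c
 * -lsleef`). The `_uNN` suffix encodes the maximum error bound in ULP times ten
 * (u05 = 0.5 ULP, u10 = 1.0 ULP, u15 = 1.5 ULP, u35 = 3.5 ULP), and the
 * two-member structs hold paired results (for sincos: sine in .x, cosine in .y). */
#include <immintrin.h>
#include <stdio.h>
#include <sleef.h>

int main(void) {
    __m256d x = _mm256_set_pd(0.1, 0.2, 0.3, 0.4); /* lane 0 holds 0.4 */
    __m256d s = Sleef_sind4_u10(x);                /* vector sine, 1.0-ULP bound */
    Sleef___m256d_2 sc = Sleef_sincosd4_u10(x);    /* .x = sin(x), .y = cos(x) */
    double out[4];
    _mm256_storeu_pd(out, s);
    printf("sin(0.4) = %.17g\n", out[0]);
    _mm256_storeu_pd(out, sc.y);
    printf("cos(0.4) = %.17g\n", out[0]);
    return 0;
}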
+SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_powf8_u10(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fabsf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8(__m256); +SLEEF_IMPORT 
SLEEF_CONST __m256 Sleef_nextafterf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_asind4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_acosd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atand4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atan2d4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_logd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cbrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_asind4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_acosd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atand4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atan2d4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_logd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cbrtd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST 
__m256d Sleef_expd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_expd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_powd4_u10avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_coshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sinhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_coshd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tanhd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fastsind4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fastcosd4_u3500avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fastpowd4_u3500avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_asinhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_acoshd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_atanhd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_exp10d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_expm1d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log10d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log2d4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log2d4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_log1pd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_cinz_sinpid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_cospid4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4_avx(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_ldexpd4_avx(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_ilogbd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4_avx(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmad4_avx(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sqrtd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sqrtd4_u05avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_sqrtd4_u35avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_hypotd4_u05avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_hypotd4_u35avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fabsd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_copysignd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmaxd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmind4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fdimd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_truncd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_floord4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_ceild4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_roundd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_rintd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_nextafterd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_frfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_cinz_expfrexpd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_fmodd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_remainderd4_avx(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_cinz_modfd4_avx(__m256d); +SLEEF_IMPORT SLEEF_CONST 
__m256d Sleef_lgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_lgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_tgammad4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_erfd4_u10avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15avx(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cinz_erfcd4_u15avx(__m256d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd4_avx(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4_avx(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_asinf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_acosf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atanf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atan2f8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_logf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cbrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_asinf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_acosf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atanf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atan2f8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_logf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cbrtf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_expf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_powf8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_powf8_u10avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_coshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_coshf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tanhf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fastsinf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fastcosf8_u3500avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fastpowf8_u3500avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_asinhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_acoshf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_atanhf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_exp10f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_expm1f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log10f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log2f8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log2f8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_log1pf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_sincospif8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sinpif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_cospif8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8_avx(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST 
__m256 Sleef_cinz_fmaf8_avx(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sqrtf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sqrtf8_u05avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_sqrtf8_u35avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_hypotf8_u05avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_hypotf8_u35avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fabsf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fabsf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_copysignf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fmaxf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fminf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fdimf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_truncf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_floorf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_ceilf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_roundf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_rintf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_nextafterf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_nextafterf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_frfrexpf8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_fmodf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_remainderf8_avx(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_cinz_modff8_avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_lgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_tgammaf8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_erff8_u10avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15avx(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cinz_erfcf8_u15avx(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8_avx(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf8_avx(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8_avx(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf8_avx(int); +#endif +#ifdef __AVX__ + +#ifndef 
Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_powd4_u10fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_finz_tanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tanhd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastsind4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastcosd4_u3500fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastpowd4_u3500fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asinhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acoshd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atanhd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expm1d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log10d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log1pd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinpid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cospid4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4_fma4(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ldexpd4_fma4(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_ilogbd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4_fma4(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmad4_fma4(__m256d, __m256d, 
__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u05fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u35fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u05fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u35fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fabsd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_copysignd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmaxd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmind4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fdimd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_truncd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_floord4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ceild4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_roundd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_rintd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_nextafterd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_frfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_expfrexpd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmodd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_remainderd4_fma4(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_modfd4_fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_lgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_lgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tgammad4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfd4_u10fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfcd4_u15fma4(__m256d); +SLEEF_IMPORT SLEEF_CONST int 
Sleef_getIntd4_fma4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4_fma4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_powf8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_powf8_u10fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_finz_tanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanhf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastsinf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastcosf8_u3500fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastpowf8_u3500fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acoshf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanhf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expm1f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log10f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log1pf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinpif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cospif8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8_fma4(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaf8_fma4(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_u05fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_finz_sqrtf8_u35fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u05fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u35fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fabsf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fabsf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_copysignf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaxf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fminf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fdimf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_truncf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_floorf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_ceilf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_roundf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_rintf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_nextafterf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_nextafterf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_frfrexpf8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmodf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_remainderf8_fma4(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_modff8_fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_lgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tgammaf8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erff8_u10fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15fma4(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erfcf8_u15fma4(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8_fma4(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf8_fma4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8_fma4(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf8_fma4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_finz_cosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asind4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acosd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atand4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atan2d4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atan2d4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_logd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_logd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cbrtd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cbrtd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_powd4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_powd4_u10avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_coshd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_coshd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d 
Sleef_tanhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tanhd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastsind4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastsind4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastcosd4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastcosd4_u3500avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fastpowd4_u3500avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fastpowd4_u3500avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_asinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_asinhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_acoshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_acoshd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_atanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_atanhd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_exp10d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_exp10d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_expm1d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_expm1d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log10d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log2d4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_log1pd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_log1pd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_sincospid4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sinpid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sinpid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_cospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_cospid4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ldexpd4_avx2(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ldexpd4_avx2(__m256d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_ilogbd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmad4_avx2(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmad4_avx2(__m256d, __m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u05avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_sqrtd4_u35avx2(__m256d); 
+SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_sqrtd4_u35avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u05avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u05avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_hypotd4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_hypotd4_u35avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fabsd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fabsd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_copysignd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_copysignd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmaxd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmaxd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmind4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmind4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fdimd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fdimd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_truncd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_truncd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_floord4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_floord4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_ceild4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_ceild4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_roundd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_roundd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_rintd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_rintd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_nextafterd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_nextafterd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_frfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_frfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_expfrexpd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_fmodd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_fmodd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_remainderd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_remainderd4_avx2(__m256d, __m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_modfd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST Sleef___m256d_2 Sleef_finz_modfd4_avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_lgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_lgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_tgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_tgammad4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfd4_u10avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_erfcd4_u15avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST __m256d Sleef_finz_erfcd4_u15avx2(__m256d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd4_avx2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd4_avx2(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u35avx2(__m256); 
+SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acosf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atan2f8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atan2f8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_logf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_logf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cbrtf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cbrtf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_powf8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_powf8_u10avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinhf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinhf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_coshf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_coshf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tanhf8_u35avx2(__m256); 
+SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tanhf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastsinf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastsinf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastcosf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastcosf8_u3500avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fastpowf8_u3500avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fastpowf8_u3500avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_asinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_asinhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_acoshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_acoshf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_atanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_atanhf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_exp10f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_exp10f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_expm1f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_expm1f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log10f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log2f8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_log1pf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_log1pf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_sincospif8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_sincospif8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sinpif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sinpif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_cospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_cospif8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaf8_avx2(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaf8_avx2(__m256, __m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_u05avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_sqrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_sqrtf8_u35avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u05avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u05avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_hypotf8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_hypotf8_u35avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 
Sleef_fabsf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fabsf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_copysignf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_copysignf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmaxf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmaxf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fminf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fminf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fdimf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fdimf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_truncf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_truncf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_floorf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_floorf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_ceilf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_ceilf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_roundf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_roundf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_rintf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_rintf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_nextafterf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_nextafterf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_frfrexpf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_frfrexpf8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_fmodf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_fmodf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_remainderf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_remainderf8_avx2(__m256, __m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_modff8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST Sleef___m256_2 Sleef_finz_modff8_avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_lgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_lgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_tgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_tgammaf8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erff8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erff8_u10avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_erfcf8_u15avx2(__m256); +SLEEF_IMPORT SLEEF_CONST __m256 Sleef_finz_erfcf8_u15avx2(__m256); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf8_avx2(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf8_avx2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf8_avx2(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf8_avx2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_asind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_asind2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_acosd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atand2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atan2d2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_logd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cbrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_asind2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_acosd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atand2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atan2d2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atan2d2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_logd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_logd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cbrtd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cbrtd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_expd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_powd2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_powd2_u10avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_coshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sinhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_coshd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_coshd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tanhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tanhd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastsind2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_finz_fastsind2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastcosd2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fastcosd2_u3500avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fastpowd2_u3500avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fastpowd2_u3500avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_asinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_asinhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_acoshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_acoshd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_atanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_atanhd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_exp10d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_exp10d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_expm1d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_expm1d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log10d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log2d2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log2d2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_log1pd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_log1pd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_sincospid2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_sincospid2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sinpid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sinpid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_cospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_cospid2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ldexpd2_avx2128(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_ldexpd2_avx2128(__m128d, __m128i); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_ilogbd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_ilogbd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmad2_avx2128(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmad2_avx2128(__m128d, __m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sqrtd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_sqrtd2_u05avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_sqrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d 
Sleef_finz_sqrtd2_u35avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u05avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_hypotd2_u05avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_hypotd2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_hypotd2_u35avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fabsd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fabsd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_copysignd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_copysignd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmaxd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmaxd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmind2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmind2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fdimd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fdimd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_truncd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_truncd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_floord2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_floord2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_ceild2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_ceild2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_roundd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_roundd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_rintd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_rintd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_nextafterd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_nextafterd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_frfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_frfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_expfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128i Sleef_finz_expfrexpd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_fmodd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_fmodd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_remainderd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_remainderd2_avx2128(__m128d, __m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_modfd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST Sleef___m128d_2 Sleef_finz_modfd2_avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_lgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_lgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_tgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_tgammad2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_erfd2_u10avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_erfcd2_u15avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST __m128d Sleef_finz_erfcd2_u15avx2128(__m128d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd2_avx2128(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd2_avx2128(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST 
__m128 Sleef_sinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_asinf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_acosf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atanf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atan2f4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_logf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cbrtf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_asinf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_acosf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atanf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atan2f4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atan2f4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_logf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_logf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cbrtf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cbrtf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_expf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_powf4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_powf4_u10avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_coshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinhf4_u35avx2128(__m128); +SLEEF_IMPORT 
SLEEF_CONST __m128 Sleef_finz_sinhf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_coshf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_coshf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tanhf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tanhf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastsinf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fastsinf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastcosf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fastcosf4_u3500avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fastpowf4_u3500avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fastpowf4_u3500avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_asinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_asinhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_acoshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_acoshf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_atanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_atanhf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_exp10f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_exp10f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_expm1f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_expm1f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log10f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log2f4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log2f4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_log1pf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_log1pf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_sincospif4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_sincospif4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sinpif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sinpif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_cospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_cospif4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaf4_avx2128(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fmaf4_avx2128(__m128, __m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sqrtf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sqrtf4_u05avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_sqrtf4_u35avx2128(__m128); 
+SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_sqrtf4_u35avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u05avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_hypotf4_u05avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_hypotf4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_hypotf4_u35avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fabsf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fabsf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_copysignf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_copysignf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmaxf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fmaxf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fminf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fminf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fdimf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fdimf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_truncf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_truncf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_floorf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_floorf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_ceilf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_ceilf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_roundf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_roundf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_rintf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_rintf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_nextafterf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_nextafterf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_frfrexpf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_frfrexpf4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_fmodf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_fmodf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_remainderf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_remainderf4_avx2128(__m128, __m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_modff4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST Sleef___m128_2 Sleef_finz_modff4_avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_lgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_lgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_tgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_tgammaf4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erff4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_erff4_u10avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_erfcf4_u15avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST __m128 Sleef_finz_erfcf4_u15avx2128(__m128); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf4_avx2128(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf4_avx2128(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf4_avx2128(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf4_avx2128(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u35(__m512d); +SLEEF_IMPORT 
SLEEF_CONST __m512d Sleef_cosd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u35(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u10(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_powd8_u10(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastsind8_u3500(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastcosd8_u3500(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastpowd8_u3500(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asinhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acoshd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atanhd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expm1d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log10d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log1pd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinpid8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cospid8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ldexpd8(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_ilogbd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmad8(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u05(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u35(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u05(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u35(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fabsd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_copysignd8(__m512d, __m512d); +SLEEF_IMPORT 
SLEEF_CONST __m512d Sleef_fmaxd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmind8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fdimd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_truncd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_floord8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ceild8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_roundd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_rintd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_nextafterd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_frfrexpd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_expfrexpd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmodd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_remainderd8(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_modfd8(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_lgammad8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tgammad8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfd8_u10(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfcd8_u15(__m512d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd8(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd8(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u35(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u10(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_powf16_u10(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastsinf16_u3500(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastcosf16_u3500(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastpowf16_u3500(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acoshf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanhf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_exp2f16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expm1f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log10f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log1pf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinpif16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cospif16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaf16(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u05(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u35(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u05(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u35(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fabsf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_copysignf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaxf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fminf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fdimf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_truncf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_floorf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_ceilf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_roundf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_rintf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_nextafterf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_frfrexpf16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmodf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_remainderf16(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_modff16(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_lgammaf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tgammaf16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erff16_u10(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erfcf16_u15(__m512); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf16(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf16(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tand8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_asind8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_acosd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atand8_u35avx512f(__m512d); +SLEEF_IMPORT 
SLEEF_CONST __m512d Sleef_atan2d8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atan2d8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_logd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cbrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_asind8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_acosd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atand8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atan2d8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_logd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cbrtd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_expd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_powd8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_powd8_u10avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sinhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_coshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sinhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_coshd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tanhd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastsind8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fastsind8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastcosd8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fastcosd8_u3500avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastpowd8_u3500avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fastpowd8_u3500avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asinhd8_u10avx512f(__m512d); 
+SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_asinhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acoshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_acoshd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_atanhd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_exp10d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expm1d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_expm1d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log10d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log2d8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log2d8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log1pd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_log1pd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_sincospid8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinpid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sinpid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_cospid8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ldexpd8_avx512f(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_ldexpd8_avx512f(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_ilogbd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_finz_ilogbd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmad8_avx512f(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmad8_avx512f(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sqrtd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sqrtd8_u05avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_sqrtd8_u35avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u05avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_hypotd8_u05avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_hypotd8_u35avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fabsd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d 
Sleef_finz_fabsd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_copysignd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_copysignd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmaxd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmaxd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmind8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmind8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fdimd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fdimd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_truncd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_truncd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_floord8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_floord8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ceild8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_ceild8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_roundd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_roundd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_rintd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_rintd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_nextafterd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_nextafterd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_frfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_frfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_expfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_finz_expfrexpd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmodd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_fmodd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_remainderd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_remainderd8_avx512f(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_modfd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_finz_modfd8_avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_lgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_lgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_tgammad8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_erfd8_u10avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfcd8_u15avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_finz_erfcd8_u15avx512f(__m512d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd8_avx512f(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd8_avx512f(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_tanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_asinf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_acosf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atanf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atan2f16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_logf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cbrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_asinf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_acosf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atanf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atan2f16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_logf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cbrtf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_expf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_powf16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_powf16_u10avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_coshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_coshf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tanhf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_fastsinf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fastsinf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastcosf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fastcosf16_u3500avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastpowf16_u3500avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fastpowf16_u3500avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_asinhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acoshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_acoshf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_atanhf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_exp10f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expm1f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_expm1f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log10f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log2f16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log2f16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log1pf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_log1pf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_sincospif16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinpif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sinpif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_cospif16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaf16_avx512f(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fmaf16_avx512f(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sqrtf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sqrtf16_u05avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_sqrtf16_u35avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u05avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_hypotf16_u05avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST 
__m512 Sleef_finz_hypotf16_u35avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fabsf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fabsf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_copysignf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_copysignf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaxf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fmaxf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fminf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fminf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fdimf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fdimf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_truncf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_truncf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_floorf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_floorf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_ceilf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_ceilf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_roundf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_roundf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_rintf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_rintf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_nextafterf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_nextafterf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_frfrexpf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_frfrexpf16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmodf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_fmodf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_remainderf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_remainderf16_avx512f(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_modff16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_finz_modff16_avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_lgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_lgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_tgammaf16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erff16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_erff16_u10avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erfcf16_u15avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_finz_erfcf16_u15avx512f(__m512); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf16_avx512f(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf16_avx512f(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf16_avx512f(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf16_avx512f(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 
Sleef_sincosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_asind8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_acosd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atand8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atan2d8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_logd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cbrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_asind8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_acosd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atand8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atan2d8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atan2d8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_logd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_logd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cbrtd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cbrtd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_expd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_powd8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_powd8_u10avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_coshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d 
Sleef_sinhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sinhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_coshd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_coshd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tanhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tanhd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastsind8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fastsind8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastcosd8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fastcosd8_u3500avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_asinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_asinhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_acoshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_acoshd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_atanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_atanhd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_exp10d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_exp10d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_expm1d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_expm1d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log10d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log2d8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log2d8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_log1pd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_log1pd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sinpid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sinpid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_cospid8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ldexpd8_avx512fnofma(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m512d 
Sleef_cinz_ldexpd8_avx512fnofma(__m512d, __m256i); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_ilogbd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_cinz_ilogbd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sqrtd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sqrtd8_u05avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_sqrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_sqrtd8_u35avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u05avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_hypotd8_u05avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_hypotd8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_hypotd8_u35avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fabsd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fabsd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_copysignd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_copysignd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmaxd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmaxd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmind8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmind8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fdimd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fdimd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_truncd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_truncd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_floord8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_floord8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_ceild8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_ceild8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_roundd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_roundd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_rintd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_rintd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_nextafterd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_nextafterd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_frfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_frfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_expfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m256i Sleef_cinz_expfrexpd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_fmodd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_fmodd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_remainderd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_remainderd8_avx512fnofma(__m512d, __m512d); +SLEEF_IMPORT SLEEF_CONST 
Sleef___m512d_2 Sleef_modfd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST Sleef___m512d_2 Sleef_cinz_modfd8_avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_lgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_lgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_tgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_tgammad8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_erfd8_u10avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_erfcd8_u15avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST __m512d Sleef_cinz_erfcd8_u15avx512fnofma(__m512d); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd8_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd8_avx512fnofma(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_asinf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_acosf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atanf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atan2f16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_logf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cbrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_asinf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_acosf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_cinz_atanf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atan2f16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atan2f16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_logf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_logf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cbrtf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cbrtf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_expf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_powf16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_powf16_u10avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_coshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_coshf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_coshf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tanhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tanhf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastsinf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fastsinf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastcosf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fastcosf16_u3500avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fastpowf16_u3500avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fastpowf16_u3500avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_asinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_asinhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_acoshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_acoshf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_atanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_atanhf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_exp10f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_exp10f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_expm1f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_expm1f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 
Sleef_cinz_log10f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_log2f16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_log2f16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_log1pf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_log1pf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_sincospif16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_sincospif16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sinpif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sinpif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_cospif16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaf16_avx512fnofma(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fmaf16_avx512fnofma(__m512, __m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sqrtf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sqrtf16_u05avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_sqrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_sqrtf16_u35avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u05avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_hypotf16_u05avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_hypotf16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_hypotf16_u35avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fabsf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fabsf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_copysignf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_copysignf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmaxf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fmaxf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fminf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fminf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fdimf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fdimf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_truncf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_truncf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_floorf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_floorf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_ceilf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_ceilf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_roundf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_roundf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_rintf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST 
__m512 Sleef_cinz_rintf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_nextafterf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_nextafterf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_frfrexpf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_frfrexpf16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_fmodf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_fmodf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_remainderf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_remainderf16_avx512fnofma(__m512, __m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_modff16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST Sleef___m512_2 Sleef_cinz_modff16_avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_lgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_lgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_tgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_tgammaf16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erff16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_erff16_u10avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_erfcf16_u15avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST __m512 Sleef_cinz_erfcf16_u15avx512fnofma(__m512); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf16_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf16_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf16_avx512fnofma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf16_avx512fnofma(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef Sleef_double2 Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_asind1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_acosd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atand1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atan2d1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_logd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cbrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 
Sleef_sincosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_asind1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_acosd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atand1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atan2d1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_logd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cbrtd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_expd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_expd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_powd1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_powd1_u10purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_coshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sinhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_coshd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tanhd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastsind1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fastsind1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastcosd1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fastcosd1_u3500purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastpowd1_u3500purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fastpowd1_u3500purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_asinhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acoshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_acoshd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_atanhd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_exp10d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double 
Sleef_expm1d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_expm1d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log10d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log2d1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log2d1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log1pd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_log1pd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_sincospid1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinpid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sinpid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_cospid1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexpd1_purec(double, int32_t); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_ldexpd1_purec(double, int32_t); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_ilogbd1_purec(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_cinz_ilogbd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmad1_purec(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmad1_purec(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sqrtd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sqrtd1_u05purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_sqrtd1_u35purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u05purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_hypotd1_u05purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_hypotd1_u35purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fabsd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fabsd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_copysignd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_copysignd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmaxd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmaxd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmind1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmind1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fdimd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fdimd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_truncd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_truncd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_floord1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_floord1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ceild1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_ceild1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_roundd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double 
Sleef_cinz_roundd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_rintd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_rintd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_nextafterd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_nextafterd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_frfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_expfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_cinz_expfrexpd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmodd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_fmodd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_remainderd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_remainderd1_purec(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_modfd1_purec(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_cinz_modfd1_purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_lgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_lgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_tgammad1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_erfd1_u10purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfcd1_u15purec(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cinz_erfcd1_u15purec(double); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd1_purec(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd1_purec(int); + +#ifndef Sleef_float_2_DEFINED +typedef Sleef_float2 Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_asinf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_acosf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atanf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atan2f1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_logf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cbrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float 
Sleef_tanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_asinf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_acosf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atanf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atan2f1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_logf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cbrtf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_expf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_powf1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_powf1_u10purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_coshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_coshf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tanhf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fastsinf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fastcosf1_u3500purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf1_u3500purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fastpowf1_u3500purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_asinhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acoshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_acoshf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_atanhf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp10f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_exp10f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_expm1f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log10f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log10f1_u10purec(float); +SLEEF_IMPORT 
SLEEF_CONST float Sleef_log2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log2f1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log2f1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_log1pf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_sincospif1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sinpif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_cospif1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf1_purec(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fmaf1_purec(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sqrtf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sqrtf1_u05purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_sqrtf1_u35purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u05purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_hypotf1_u05purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_hypotf1_u35purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fabsf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_copysignf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fmaxf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fminf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fminf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fdimf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_truncf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_truncf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_floorf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_floorf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_ceilf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_roundf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_roundf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_rintf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_rintf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_nextafterf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_frfrexpf1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_fmodf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf1_purec(float, float); 
+SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_remainderf1_purec(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_modff1_purec(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_cinz_modff1_purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_lgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_tgammaf1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erff1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_erff1_u10purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf1_u15purec(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cinz_erfcf1_u15purec(float); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf1_purec(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_cinz_getIntf1_purec(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf1_purec(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_cinz_getPtrf1_purec(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef Sleef_double2 Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_asind1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_acosd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atand1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atan2d1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_logd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cbrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_asind1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_acosd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atand1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST 
double Sleef_atan2d1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atan2d1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_logd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cbrtd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_expd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_expd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_powd1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_powd1_u10purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_coshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sinhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_coshd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tanhd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastsind1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fastsind1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastcosd1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fastcosd1_u3500purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fastpowd1_u3500purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fastpowd1_u3500purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_asinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_asinhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_acoshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_acoshd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_atanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_atanhd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp2d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_exp10d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_expm1d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_expm1d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log10d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log2d1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log2d1_u35purecfma(double); +SLEEF_IMPORT 
SLEEF_CONST double Sleef_log1pd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_log1pd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_sincospid1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sinpid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sinpid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_cospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_cospid1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexpd1_purecfma(double, int32_t); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_ldexpd1_purecfma(double, int32_t); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_ilogbd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_finz_ilogbd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmad1_purecfma(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmad1_purecfma(double, double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sqrtd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sqrtd1_u05purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_sqrtd1_u35purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u05purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_hypotd1_u05purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_hypotd1_u35purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fabsd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fabsd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_copysignd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_copysignd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmaxd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmaxd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmind1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmind1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fdimd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fdimd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_truncd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_truncd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_floord1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_floord1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ceild1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_ceild1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_roundd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_roundd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_rintd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_rintd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_nextafterd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_nextafterd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexpd1_purecfma(double); 
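+// ---------------------------------------------------------------------------
+// [Editor's note, illustrative sketch, not part of the upstream header] The
+// declarations above follow SLEEF's naming scheme: "d1"/"f1" are single-lane
+// double/float entry points, the "uNN" suffix is the worst-case error bound
+// in tenths of an ULP (u05 = 0.5 ULP, u10 = 1.0 ULP, u35 = 3.5 ULP), and
+// "purec"/"purecfma" are the pure-C kernels built without/with FMA; the
+// "cinz_"/"finz_" names are the matching deterministic aliases. A minimal
+// caller, assuming <sleef.h> is on the include path and libsleef is linked:
+//
+//   #include <sleef.h>
+//   #include <stdio.h>
+//
+//   int main(void) {
+//     double x = 0.5;
+//     double hi = Sleef_sind1_u10purecfma(x);  // sine, 1.0-ULP bound
+//     double lo = Sleef_sind1_u35purecfma(x);  // faster, 3.5-ULP bound
+//     printf("%.17g %.17g\n", hi, lo);
+//     return 0;
+//   }
+// ---------------------------------------------------------------------------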
+SLEEF_IMPORT SLEEF_CONST double Sleef_finz_frfrexpd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_expfrexpd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_finz_expfrexpd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmodd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_fmodd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_remainderd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_remainderd1_purecfma(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_modfd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_finz_modfd1_purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_lgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_lgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_tgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_tgammad1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_erfd1_u10purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_erfcd1_u15purecfma(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_finz_erfcd1_u15purecfma(double); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd1_purecfma(int); + +#ifndef Sleef_float_2_DEFINED +typedef Sleef_float2 Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_asinf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_acosf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atanf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atan2f1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_logf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cbrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_asinf1_u10purecfma(float); +SLEEF_IMPORT 
SLEEF_CONST float Sleef_acosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_acosf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atanf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atan2f1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_logf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cbrtf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_expf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_powf1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_powf1_u10purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_coshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_coshf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tanhf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fastsinf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fastcosf1_u3500purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf1_u3500purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fastpowf1_u3500purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_asinhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_acoshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_acoshf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_atanhf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_exp10f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_expm1f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_log10f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float 
Sleef_finz_log2f1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_log2f1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_log1pf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_sincospif1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sinpif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_cospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_cospif1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf1_purecfma(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fmaf1_purecfma(float, float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sqrtf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sqrtf1_u05purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_sqrtf1_u35purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u05purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_hypotf1_u05purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_hypotf1_u35purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fabsf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_copysignf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fmaxf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fminf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fminf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fdimf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_truncf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_truncf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_floorf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_floorf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_ceilf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_roundf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_roundf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_rintf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_rintf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_nextafterf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_frfrexpf1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_fmodf1_purecfma(float, float); 
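+// ---------------------------------------------------------------------------
+// [Editor's note, illustrative] Functions declared with a Sleef_double_2 /
+// Sleef_float_2 return type pack two results into one struct; for the sincos
+// and sincospi families the sine is returned in the .x member and the cosine
+// in .y. A minimal sketch, assuming the header is included:
+//
+//   Sleef_float_2 sc = Sleef_sincosf1_u10purecfma(0.5f);
+//   float s = sc.x;  // sin(0.5f), 1.0-ULP bound
+//   float c = sc.y;  // cos(0.5f), 1.0-ULP bound
+// ---------------------------------------------------------------------------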
+SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_remainderf1_purecfma(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_modff1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_finz_modff1_purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_lgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_tgammaf1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erff1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_erff1_u10purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf1_u15purecfma(float); +SLEEF_IMPORT SLEEF_CONST float Sleef_finz_erfcf1_u15purecfma(float); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST int Sleef_finz_getIntf1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf1_purecfma(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_finz_getPtrf1_purecfma(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef Sleef_double2 Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u35(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u35(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sind1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cosd1_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincosd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tand1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asind1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acosd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atand1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atan2d1_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_logd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cbrtd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_expd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_powd1_u10(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tanhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinhd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_coshd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double 
Sleef_tanhd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fastsind1_u3500(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fastcosd1_u3500(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fastpowd1_u3500(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_asinhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_acoshd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_atanhd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp2d1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_exp10d1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_expm1d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log10d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log2d1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_log1pd1_u10(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u05(double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 Sleef_sincospid1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sinpid1_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cospid1_u05(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_ldexpd1(double, int32_t); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_ilogbd1(double); +SLEEF_IMPORT SLEEF_CONST double Sleef_fmad1(double, double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u05(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sqrtd1_u35(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u05(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_hypotd1_u35(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fabsd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_copysignd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmaxd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmind1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fdimd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_truncd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_floord1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_ceild1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_roundd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_rintd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_nextafterd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_frfrexpd1(double); +SLEEF_IMPORT SLEEF_CONST int32_t Sleef_expfrexpd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_fmodd1(double, double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_remainderd1(double, double); +SLEEF_IMPORT SLEEF_CONST Sleef_double_2 
Sleef_modfd1(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_lgammad1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_tgammad1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erfd1_u10(double); +SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_erfcd1_u15(double); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntd1(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrd1(int); + +#ifndef Sleef_float_2_DEFINED +typedef Sleef_float2 Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u35(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u35(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cosf1_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincosf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acosf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atan2f1_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_logf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cbrtf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_powf1_u10(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinhf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_coshf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tanhf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastsinf1_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastcosf1_u3500(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fastpowf1_u3500(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_asinhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_acoshf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_atanhf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp2f1_u35(float); 
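+// ---------------------------------------------------------------------------
+// [Editor's note, illustrative] The SLEEF_PRAGMA_OMP_SIMD_DP/_SP macros in
+// front of these dispatch entry points expand, when OpenMP SIMD support is
+// enabled, to "omp declare simd" annotations, so an OpenMP-aware compiler
+// may vectorize loops over the scalar functions. A minimal sketch, assuming
+// compilation with OpenMP SIMD (e.g. -fopenmp-simd) and libsleef linked:
+//
+//   #include <sleef.h>
+//
+//   void sin_all(const double *in, double *out, int n) {
+//     #pragma omp simd
+//     for (int i = 0; i < n; i++)
+//       out[i] = Sleef_sind1_u10(in[i]);  // 1.0-ULP sine per element
+//   }
+// ---------------------------------------------------------------------------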
+SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_exp10f1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_expm1f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log10f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log2f1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_log1pf1_u10(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u05(float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_sincospif1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sinpif1_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_cospif1_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaf1(float, float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u05(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_sqrtf1_u35(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u05(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_hypotf1_u35(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fabsf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_copysignf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmaxf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fminf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fdimf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_truncf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_floorf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_ceilf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_roundf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_rintf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_nextafterf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_frfrexpf1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_fmodf1(float, float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_remainderf1(float, float); +SLEEF_IMPORT SLEEF_CONST Sleef_float_2 Sleef_modff1(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_lgammaf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_tgammaf1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erff1_u10(float); +SLEEF_PRAGMA_OMP_SIMD_SP SLEEF_IMPORT SLEEF_CONST float Sleef_erfcf1_u15(float); +SLEEF_IMPORT SLEEF_CONST int Sleef_getIntf1(int); +SLEEF_IMPORT SLEEF_CONST void *Sleef_getPtrf1(int); +#endif + +// + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // #ifndef __SLEEF_H__ diff --git a/.venv/lib/python3.11/site-packages/torch/include/xnnpack.h b/.venv/lib/python3.11/site-packages/torch/include/xnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..e71be0fd57ffc1ef2cc67b2fc8fb20fc4288a1d2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/xnnpack.h @@ -0,0 +1,6172 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <pthreadpool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/// The number of bytes XNNPACK may read beyond array bounds. +/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK. +/// +/// Note: XNNPACK reads, but never writes beyond array bounds. +#define XNN_EXTRA_BYTES 16 + +/// Maximum number of dimensions in tensor shape. +#define XNN_MAX_TENSOR_DIMS 6 + +/// Allow sparse inference in a Runtime. +/// +/// Note: this flag hints XNNPACK to consider sparse inference, but does not guarantee it. +#define XNN_FLAG_HINT_SPARSE_INFERENCE 0x00000001 + +/// Allow IEEE FP16 inference in a Runtime. +/// +/// Note: this flag hints XNNPACK to consider IEEE FP16 inference, but does not guarantee it. +#define XNN_FLAG_HINT_FP16_INFERENCE 0x00000002 + +/// Force IEEE FP16 inference in a Runtime, and fail if FP16 inference is not possible. +/// +/// Note: this flag guarantees that XNNPACK will use IEEE FP16 inference, or fail to create the Runtime object. +/// Warning: on x86 systems FP16 computations will be emulated at a substantial performance cost. +#define XNN_FLAG_FORCE_FP16_INFERENCE 0x00000004 + +/// Enable timing of each operator's runtime. +#define XNN_FLAG_BASIC_PROFILING 0x00000008 + +/// Enable the just-in-time compiler. +#define XNN_FLAG_JIT 0x00000010 + +/// The convolution operator represents a depthwise convolution, and uses HWGo layout for filters. +#define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001 + +/// Assume transposed weights in a fully connected operator. +#define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001 + +/// The operator assumes NHWC layout for the input, regardless of the output layout. +#define XNN_FLAG_INPUT_NHWC 0x00000002 + +/// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size. +#define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004 + +/// Assume transposed weights in a batch matrix multiply operator. +#define XNN_FLAG_TRANSPOSE_B XNN_FLAG_TRANSPOSE_WEIGHTS + +/// Assume transposed input in a batch matrix multiply operator. +#define XNN_FLAG_TRANSPOSE_A 0x00000002 + +/// Implicitly flatten and reshape input of a Fully Connected operator into a 2D tensor. +#define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004 + +/// Match behaviour of TensorFlow 1.x. +#define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004 + +/// Static weights of the FP16 operator are in FP32 format. +#define XNN_FLAG_FP32_STATIC_WEIGHTS 0x00000008 + +/// Align corners of input and output images in resize operations. +#define XNN_FLAG_ALIGN_CORNERS 0x00000008 + +/// Yield worker threads of the thread pool to the system scheduler after the inference. +#define XNN_FLAG_YIELD_WORKERS 0x00000010 + +/// Use transient indirection buffer to reduce memory footprint. +#define XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER 0x00000020 + +/// Reduce the dimensions. +#define XNN_FLAG_REDUCE_DIMS 0x00000040 + +/// The number of entries in an array of xnn_dynamic_quantization_params that XNNPACK may read beyond array bounds. +/// The caller must allocate at least this many extra xnn_dynamic_quantization_params before passing the array to XNNPACK. +/// +/// Note: XNNPACK reads, but never writes beyond array bounds.
+#define XNN_EXTRA_QUANTIZATION_PARAMS 8 + +struct xnn_dynamic_quantization_params { + int32_t zero_point; + float scale; +}; + +/// Status code for any XNNPACK function call. +enum xnn_status { + /// The call succeeded, and all output arguments now contain valid data. + xnn_status_success = 0, + xnn_status_uninitialized = 1, + xnn_status_invalid_parameter = 2, + xnn_status_invalid_state = 3, + xnn_status_unsupported_parameter = 4, + xnn_status_unsupported_hardware = 5, + xnn_status_out_of_memory = 6, + xnn_status_reallocation_required = 7, +}; + +struct xnn_allocator { + /// User-specified pointer that will be passed as-is to all functions in this structure. + void* context; + /// Pointer to a function to be called for general memory allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param size - The size of the memory block to allocate, in bytes. + /// + /// @returns Pointer to the allocated memory block of at least @ref size bytes. + /// If allocation fails, the function must return NULL. + void* (*allocate)(void* context, size_t size); + /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously + /// allocated memory block. The content of the old memory block is copied to the new memory block. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL. + /// If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call. + /// @param size - The new size of the memory block to allocate, in bytes. + /// + /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous + /// memory block. + /// If allocation fails, the function must return NULL, but must not release the previous memory block. + void* (*reallocate)(void* context, void* pointer, size_t size); + /// Pointer to a function to be called for general memory de-allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL. + /// If the pointer is NULL, the @ref deallocate call is a no-op. + void (*deallocate)(void* context, void* pointer); + /// Pointer to a function to be called for aligned memory allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2. + /// @param size - The size of the memory block to allocate, in bytes. + /// + /// @returns Pointer to the allocated memory block of at least @ref size bytes. + /// If allocation fails, the function must return NULL. + void* (*aligned_allocate)(void* context, size_t alignment, size_t size); + /// Pointer to a function to be called for aligned memory de-allocation. + /// + /// @param context - The user-specified pointer from xnn_allocator structure. + /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL. + /// If the pointer is NULL, the @ref aligned_deallocate call is a no-op. + void (*aligned_deallocate)(void* context, void* pointer); +}; + +/// Initialize XNNPACK library. +/// +/// XNNPACK must be successfully initialized before use. 
During initialization, XNNPACK populates internal structures +/// depending on the host processor. Initialization can be time-consuming. +/// +/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation. +/// If this argument is NULL, system-provided memory management functions (e.g. malloc/free) +/// will be used. +/// +/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use. +/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition. +/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the +/// minimum hardware requirements for XNNPACK. E.g. this may happen on x86 +/// processors without SSE2 extension, or on 32-bit ARM processors without +/// the NEON SIMD extension. +enum xnn_status xnn_initialize(const struct xnn_allocator* allocator); + +/// Deinitialize XNNPACK library. +/// +/// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call. +/// +/// @retval xnn_status_success - deinitialization call succeeded. +enum xnn_status xnn_deinitialize(void); + +/// Subgraph is an abstract representation of a neural network model. +/// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model. +typedef struct xnn_subgraph* xnn_subgraph_t; + +/// Create an empty Subgraph object. +/// +/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation. +/// The Subgraph object will avoid creating internal Value IDs in the +/// [0, external_value_ids-1] range. +/// @param flags - binary features of the subgraph. No supported flags are currently defined. +/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon +/// successful return. +enum xnn_status xnn_create_subgraph( + uint32_t external_value_ids, + uint32_t flags, + xnn_subgraph_t* subgraph_out); + +/// Destroy a Subgraph object, as well as the Values and Nodes associated with the subgraph. +/// +/// @param subgraph - the Subgraph object to destroy. +enum xnn_status xnn_delete_subgraph( + xnn_subgraph_t subgraph); + +#define XNN_VALUE_FLAG_EXTERNAL_INPUT 0x00000001 +#define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002 +#define XNN_VALUE_FLAG_PERSISTENT 0x00000004 + +#define XNN_INVALID_VALUE_ID UINT32_MAX + +/// Type of elements in a Value object. +enum xnn_datatype { + /// Invalid data type. Valid Values never have this datatype. + xnn_datatype_invalid = 0, + /// IEEE754 single-precision floating-point. + xnn_datatype_fp32 = 1, + /// IEEE754 half-precision floating-point. + xnn_datatype_fp16 = 2, + /// Quantized 8-bit signed integer with shared per-Value quantization parameters. + xnn_datatype_qint8 = 3, + /// Quantized 8-bit unsigned integer with shared per-Value quantization parameters. + xnn_datatype_quint8 = 4, + /// Quantized 32-bit signed integer with shared per-Value quantization parameters. + xnn_datatype_qint32 = 5, + /// Quantized 8-bit signed integer with shared per-channel quantization parameters. + xnn_datatype_qcint8 = 6, + /// Quantized 32-bit signed integer with shared per-channel quantization parameters. + xnn_datatype_qcint32 = 7, + /// Quantized 4-bit signed integer with shared per-channel quantization parameters. + xnn_datatype_qcint4 = 8, + /// Dynamically quantized 8-bit signed integer with per-batch quantization parameters.
+ xnn_datatype_qdint8 = 9, +}; + +/// Define a tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param num_dims - number of dimensions in the shape. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized, +/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time +/// of the Subgraph object, and of any Runtime objects created from the Subgraph. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT +/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value. +enum xnn_status xnn_define_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + size_t num_dims, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Define a quantized tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - multiplication factor to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized, +/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time +/// of the Subgraph object, and of any Runtime objects created from the Subgraph. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT +/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value.
+enum xnn_status xnn_define_quantized_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + int32_t zero_point, + float scale, + size_t num_dims, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +enum xnn_status xnn_define_channelwise_quantized_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + const float* scale, + size_t num_dims, + size_t channel_dim, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Validate the dimensions, zero point, datatype, and scale of a quantized tensor-type. +/// +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - multiplication factor to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +enum xnn_status xnn_validate_quantized_tensor( + enum xnn_datatype datatype, + int32_t zero_point, + float scale, + size_t num_dims, + const size_t* dims); + +/// Validate the dimensions, channel_dim, zero point, datatype, and scales of a channelwise quantized tensor-type. +/// +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - per-channel multiplication factors to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters. +/// Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution, +/// Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in +/// the Depthwise Convolution operators. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +enum xnn_status xnn_validate_channelwise_quantized_tensor( + enum xnn_datatype datatype, + int32_t zero_point, + const float* scale, + size_t num_dims, + size_t channel_dim, + const size_t* dims); + +/// Define a channelwise quantized tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param zero_point - offset from zero to subtract from the quantized elements in the Value. +/// @param scale - per-channel multiplication factors to convert quantized elements to real representation. +/// @param num_dims - number of dimensions in the shape. +/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters. +/// Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution, +/// Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in +/// the Depthwise Convolution operators. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns.
+/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized, +/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time +/// of the Subgraph object, and of any Runtime objects created from the Subgraph. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT +/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value. +enum xnn_status xnn_define_channelwise_quantized_tensor_value_v2( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + int32_t zero_point, + const float* scale, + size_t num_dims, + size_t channel_dim, + const size_t* dims, + const void* data, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Define a dynamically quantized tensor-type Value and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Value. +/// @param datatype - type of the tensor elements. +/// @param num_dims - number of dimensions in the shape. +/// @param num_nonbatch_dims - number of non-batch dimensions in the shape. The leading (num_dims - num_nonbatch_dims) +/// dimensions will be flattened and treated as batch size. A set of quantization parameters +/// will be calculated for each batch element. +/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL. +/// XNNPACK does not keep any pointers to this array after the function returns. +/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified at +/// Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be +/// created for the Value. +/// @param flags - binary features of the Value. No supported flags are currently defined. +/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a +/// valid @a external_id was provided, the variable will be initialized with the @a external_id value. +enum xnn_status xnn_define_dynamically_quantized_tensor_value( + xnn_subgraph_t subgraph, + enum xnn_datatype datatype, + size_t num_dims, + size_t num_nonbatch_dims, + const size_t* dims, + uint32_t external_id, + uint32_t flags, + uint32_t* id_out); + +/// Define a Convert Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Convert Node. No supported flags are currently defined. +enum xnn_status xnn_define_convert( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2D Convolution Node and add it to a Subgraph.
+/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING +/// flag is specified. +/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param kernel_height - kernel (filter) height. +/// @param kernel_width - kernel (filter) width. +/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride). +/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride). +/// @param dilation_height - dilation of kernel elements along the height dimension. +/// @param dilation_width - dilation of kernel elements along the width dimension. +/// @param groups - number of convolution groups. +/// @param group_input_channels - number of input channels per group. +/// @param group_output_channels - number of output channels per group. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, groups * group_input_channels] dimensions. +/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph +/// with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels] +/// dimensions. +/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Convolution Node without a bias. If +/// present, the bias tensor must be a 1D tensor defined in the @a subgraph with [groups * +/// group_output_channels] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, groups * group_output_channels] dimensions. +/// @param flags - binary features of the 2D Convolution Node. The only currently supported value is +/// XNN_FLAG_TENSORFLOW_SAME_PADDING. +enum xnn_status xnn_define_convolution_2d( + xnn_subgraph_t subgraph, + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + float output_min, + float output_max, + uint32_t input_id, + uint32_t filter_id, + uint32_t bias_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param padding_top - implicit padding above 2D output data. +/// @param padding_right - implicit padding to the right of 2D output data. +/// @param padding_bottom - implicit padding below 2D output data. +/// @param padding_left - implicit padding to the left of 2D output data.
+/// @param adjustment_height - additional elements in the bottom of the 2D output data. +/// @param adjustment_width - additional elements to the right of the 2D output data. +/// @param kernel_height - kernel (filter) height. +/// @param kernel_width - kernel (filter) width. +/// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride). +/// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride). +/// @param dilation_height - dilation of kernel elements along the height dimension. +/// @param dilation_width - dilation of kernel elements along the width dimension. +/// @param groups - number of convolution groups. +/// @param group_input_channels - number of input channels per group. +/// @param group_output_channels - number of output channels per group. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, groups * group_input_channels] dimensions. +/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph +/// with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels] +/// dimensions. +/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Deconvolution Node without a bias. +/// If present, the bias tensor must be a 1D tensor defined in the @a subgraph with +/// [groups * group_output_channels] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, groups * group_output_channels] dimensions. +/// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined. +enum xnn_status xnn_define_deconvolution_2d( + xnn_subgraph_t subgraph, + uint32_t padding_top, + uint32_t padding_right, + uint32_t padding_bottom, + uint32_t padding_left, + uint32_t adjustment_height, + uint32_t adjustment_width, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t upsampling_height, + uint32_t upsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + float output_min, + float output_max, + uint32_t input_id, + uint32_t filter_id, + uint32_t bias_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2D Depthwise Convolution Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING +/// flag is specified. +/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if +/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified. +/// @param kernel_height - kernel (filter) height. +/// @param kernel_width - kernel (filter) width.
+/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride). +/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride). +/// @param dilation_height - dilation of kernel elements along the height dimension. +/// @param dilation_width - dilation of kernel elements along the width dimension. +/// @param depth_multiplier - ratio of output channels to input channels. +/// @param input_channels - number of input channels. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, input_channels] dimensions. +/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph +/// with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions. +/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Depthwise Convolution Node without +/// a bias. If present, the bias tensor must be a 1D tensor defined in the @a subgraph with +/// [input_channels * depth_multiplier] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, input_channels * depth_multiplier] dimensions. +/// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is +/// XNN_FLAG_TENSORFLOW_SAME_PADDING. +enum xnn_status xnn_define_depthwise_convolution_2d( + xnn_subgraph_t subgraph, + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t depth_multiplier, + size_t input_channels, + float output_min, + float output_max, + uint32_t input_id, + uint32_t filter_id, + uint32_t bias_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Depth To Space 2D Node and add it to a Subgraph. +/// +/// The Depth To Space 2D Node rearranges data from depth into blocks of spatial data (a reverse transform to +/// Space To Depth). For a given input pixel, an output square of pixels with side @a block_size is formed from values +/// in the corresponding number of its channels. The output depth is therefore @a block_size x @a block_size times +/// smaller than that of the input. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param block_size - the size of the spatial block. +/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH, IW, OC * block_size * block_size] dimensions. +/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, IH * block_size, IW * block_size, OC] dimensions. +/// @param flags - binary features of the Depth To Space 2D Node. No supported flags are currently defined.
+enum xnn_status xnn_define_depth_to_space_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t block_size,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+enum xnn_status xnn_define_depth_to_space(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t block_size,
+  uint32_t flags);
+
+/// Define a 1D Global Average Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
+///                   defined in the @a subgraph. Averaging is performed across the second-innermost dimension.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 1D Global Average Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_average_pooling_1d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Global Average Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
+///                   defined in the @a subgraph. Averaging is performed across the second- and third-innermost
+///                   dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 2D Global Average Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_average_pooling_2d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 1D Global Sum Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
+///                   defined in the @a subgraph. Summation is performed across the second-innermost dimension.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 1D Global Sum Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_sum_pooling_1d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Global Sum Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
+///                   defined in the @a subgraph. Summation is performed across the second- and third-innermost
+///                   dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
+///                    dimensions defined in the @a subgraph.
+/// @param flags - binary features of the 2D Global Sum Pooling Node. The only currently supported value is
+///                XNN_FLAG_REDUCE_DIMS.
+enum xnn_status xnn_define_global_sum_pooling_2d(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Average Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
+///                            flag is specified.
+/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
+///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
+///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
+///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param pooling_height - pooling (kernel) height.
+/// @param pooling_width - pooling (kernel) width.
+/// @param stride_height - displacement of the pooling window in the vertical dimension of the input pixels
+///                        corresponding to vertically adjacent output pixels.
+/// @param stride_width - displacement of the pooling window in the horizontal dimension of the input pixels
+///                       corresponding to horizontally adjacent output pixels.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH, IW, channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, OH, OW, channels] dimensions.
+/// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
+///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
+enum xnn_status xnn_define_average_pooling_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t input_padding_top,
+  uint32_t input_padding_right,
+  uint32_t input_padding_bottom,
+  uint32_t input_padding_left,
+  uint32_t pooling_height,
+  uint32_t pooling_width,
+  uint32_t stride_height,
+  uint32_t stride_width,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Fully Connected Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
+///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
+///                   1D and its last dimension must match the last dimension of the filter tensor.
+///                   In particular, if input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
+///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
+///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
+///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
+///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
+///                   total number of elements in the input tensor.
+/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
+///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
+///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
+///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
+/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
+///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
+///                  dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
+///                    dimensionality as the input tensor, all its dimensions but the last one must match the
+///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
+///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
+///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
+///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
+///                    total number of elements in the input tensor.
+/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
+///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
+enum xnn_status xnn_define_fully_connected(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t filter_id,
+  uint32_t bias_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Sparse Fully Connected Node and add it to a Subgraph.
+///
+/// This operator is experimental, and will be removed in the future.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
+///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
+///                   1D and its last dimension must match the last dimension of the filter tensor. In particular, if
+///                   input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
+///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
+///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
+///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
+///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
+///                   total number of elements in the input tensor.
+/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
+///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
+///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
+///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
+/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
+///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
+///                  dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
+///                    dimensionality as the input tensor, all its dimensions but the last one must match the
+///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
+///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
+///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
+///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
+///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
+///                    total number of elements in the input tensor.
+/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
+///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
+enum xnn_status xnn_define_fully_connected_sparse(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t filter_id,
+  uint32_t bias_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D Max Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
+///                            flag is specified.
+/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
+///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
+///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
+///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
+/// @param pooling_height - pooling (kernel) height.
+/// @param pooling_width - pooling (kernel) width.
+/// @param stride_height - displacement of the pooling window in the vertical dimension of the input pixels
+///                        corresponding to vertically adjacent output pixels.
+/// @param stride_width - displacement of the pooling window in the horizontal dimension of the input pixels
+///                       corresponding to horizontally adjacent output pixels.
+/// @param dilation_height - dilation of pooling elements along the height dimension.
+/// @param dilation_width - dilation of pooling elements along the width dimension.
+/// @param output_min - lower bound for clipping output values.
+/// @param output_max - upper bound for clipping output values.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH, IW, channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, OH, OW, channels] dimensions.
+/// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
+///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
+enum xnn_status xnn_define_max_pooling_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t input_padding_top,
+  uint32_t input_padding_right,
+  uint32_t input_padding_bottom,
+  uint32_t input_padding_left,
+  uint32_t pooling_height,
+  uint32_t pooling_width,
+  uint32_t stride_height,
+  uint32_t stride_width,
+  uint32_t dilation_height,
+  uint32_t dilation_width,
+  float output_min,
+  float output_max,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_padding_top - implicit zero-padding above 2D input data.
+/// @param input_padding_right - implicit zero-padding to the right of 2D input data.
+/// @param input_padding_bottom - implicit zero-padding below 2D input data.
+/// @param input_padding_left - implicit zero-padding to the left of 2D input data.
+/// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions matches this value.
+/// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions matches this value.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH, IW, channels] dimensions.
+/// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
+///                          be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
+/// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
+///                          output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
+///                          dimensions.
+/// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
+enum xnn_status xnn_define_argmax_pooling_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t input_padding_top,
+  uint32_t input_padding_right,
+  uint32_t input_padding_bottom,
+  uint32_t input_padding_left,
+  uint32_t pooling_height,
+  uint32_t pooling_width,
+  uint32_t input_id,
+  uint32_t output_value_id,
+  uint32_t output_index_id,
+  uint32_t flags);
+
+/// Define a 2D UnPooling Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param padding_top - implicit padding above 2D output data.
+/// @param padding_right - implicit padding to the right of 2D output data.
+/// @param padding_bottom - implicit padding below 2D output data.
+/// @param padding_left - implicit padding to the left of 2D output data.
+/// @param pooling_height - height of the pooling window.
+/// @param pooling_width - width of the pooling window.
+/// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
+///                         must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
+/// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
+///                         a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph
+///                         with [N, IH, IW, channels] dimensions.
+/// @param output_id - Value ID for the output tensor.
The output tensor must be a 4D tensor defined in the @a subgraph +/// with [N, OH, OW, channels] dimensions. +/// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined. +enum xnn_status xnn_define_unpooling_2d( + xnn_subgraph_t subgraph, + uint32_t padding_top, + uint32_t padding_right, + uint32_t padding_bottom, + uint32_t padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t input_value_id, + uint32_t input_index_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Add Node and add it to a Subgraph. +/// +/// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Add Node. No supported flags are currently defined. +enum xnn_status xnn_define_add2( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Multiply Node and add it to a Subgraph. +/// +/// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Multiply Node. No supported flags are currently defined. 
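/*
 * Editor's illustration (not part of the original header): a hedged sketch of
 * adding an unclamped, broadcasting elementwise multiply with
 * xnn_define_multiply2, declared below. The value IDs are assumed to be
 * defined elsewhere in the subgraph; -INFINITY/+INFINITY (from <math.h>)
 * express "no clamping" of the output.
 */
#include <math.h>

static enum xnn_status add_unclamped_multiply(
    xnn_subgraph_t subgraph, uint32_t a_id, uint32_t b_id, uint32_t product_id) {
  return xnn_define_multiply2(
      subgraph, /*output_min=*/-INFINITY, /*output_max=*/+INFINITY,
      a_id, b_id, product_id, /*flags=*/0);
}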
+enum xnn_status xnn_define_multiply2(
+  xnn_subgraph_t subgraph,
+  float output_min,
+  float output_max,
+  uint32_t input1_id,
+  uint32_t input2_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+// Cap operations applied to logits (Q * K) of attention operator.
+enum xnn_attention_logits_cap_type {
+  // No capping.
+  xnn_attention_logits_cap_type_none = 0,
+  // Cap the absolute values of logits by tanh: tanh(logits / cap) * cap
+  xnn_attention_logits_cap_type_tanh
+};
+
+// Params when the cap type is xnn_attention_logits_cap_type_tanh.
+struct xnn_attention_logits_cap_tanh_params {
+  float cap;
+};
+
+/// Define a Scaled Dot-Product Attention Node and add it to a Subgraph.
+///
+/// This operator is experimental.
+///
+/// The Scaled Dot-Product Attention Node computes multi-head or multi-query scaled dot-product attention on the query,
+/// key, and value tensors.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param cap_type - type of cap to be applied to the logits.
+/// @param cap_params - parameters for the cap. Must be a pointer to xnn_attention_logits_cap_tanh_params if cap_type
+///                     is xnn_attention_logits_cap_type_tanh.
+/// @param query_id - Value ID for the query tensor. The query tensor must be a 3+-dimensional tensor defined in the
+///                   @a subgraph with the dimensions as [*, H, T, C], where H/T/C are the heads/tokens/channels, and *
+///                   is the 0 or more dimensions treated as batch size.
+/// @param key_id - Value ID for the key tensor. The key tensor must be a 2+-dimensional tensor defined in the
+///                 @a subgraph. It can have the same number of dimensions as the query, with the dimensions as
+///                 [*, H, U, C] (multi-head), or have 1 less dimension than the query, with the dimensions as
+///                 [*, U, C] (multi-query, number of heads omitted implies single head), where H/U/C are the
+///                 heads/key_value_tokens/channels, and * is the 0 or more dimensions treated as batch size. These
+///                 batch size dimensions must be the same as query.
+/// @param value_id - Value ID for the value tensor. The value tensor must be a 2+-dimensional tensor defined in the
+///                   @a subgraph. It can have the same number of dimensions as the query, with the dimensions as
+///                   [*, H, U, D] (multi-head), or have 1 less dimension than the query, with the dimensions as
+///                   [*, U, D] (multi-query, number of heads omitted implies single head), where H/U/D are the
+///                   heads/key_value_tokens/value_channels, and * is the 0 or more dimensions treated as batch size.
+///                   These batch size dimensions must be the same as query and key.
+/// @param scale_id - Value ID for the scale tensor. The scale tensor must be a 1D tensor defined in the @a subgraph
+///                   with [C] dimensions. The query tensor is multiplied with this scale tensor before the dot product
+///                   with the key tensor.
+/// @param mask_id - Value ID for the mask tensor. The mask tensor must be a 2D tensor defined in the @a subgraph with
+///                  [T, U] dimensions. The mask tensor is added to the logits (query dot key).
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 3+-dimensional tensor defined in the
+///                    @a subgraph with the dimensions as [*, H, T, D], where H/T/D are the heads/tokens/value_channels,
+///                    and * is the 0 or more dimensions treated as batch size. These batch size dimensions must be the
+///                    same as query, key, and value.
+/// @param flags - binary features of the Scaled Dot Product Attention Node. No supported flags are currently defined.
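/*
 * Editor's illustration (not part of the original header): a sketch of the
 * experimental attention node using the declaration below together with the
 * cap enum/struct defined above. The tanh cap value of 50.0f and all value
 * IDs are illustrative assumptions.
 */
static enum xnn_status add_tanh_capped_attention(
    xnn_subgraph_t subgraph,
    uint32_t query_id, uint32_t key_id, uint32_t value_id,
    uint32_t scale_id, uint32_t mask_id, uint32_t output_id) {
  static const struct xnn_attention_logits_cap_tanh_params cap_params = {
      .cap = 50.0f  /* logits are squashed to roughly [-50, 50] */
  };
  return xnn_define_scaled_dot_product_attention(
      subgraph, xnn_attention_logits_cap_type_tanh, &cap_params,
      query_id, key_id, value_id, scale_id, mask_id, output_id, /*flags=*/0);
}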
+enum xnn_status xnn_define_scaled_dot_product_attention( + xnn_subgraph_t subgraph, + enum xnn_attention_logits_cap_type cap_type, + const void* cap_params, + uint32_t query_id, + uint32_t key_id, + uint32_t value_id, + uint32_t scale_id, + uint32_t mask_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Subtract Node and add it to a Subgraph. +/// +/// The Subtract Node computes elementwise subtraction of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Subtract Node. No supported flags are currently defined. +enum xnn_status xnn_define_subtract( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Divide Node and add it to a Subgraph. +/// +/// The Divide Node computes elementwise division of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Divide Node. No supported flags are currently defined. +enum xnn_status xnn_define_divide( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Maximum Node and add it to a Subgraph. 
+/// +/// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Maximum Node. No supported flags are currently defined. +enum xnn_status xnn_define_maximum2( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Minimum Node and add it to a Subgraph. +/// +/// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with numpy broadcasting rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Minimum Node. No supported flags are currently defined. +enum xnn_status xnn_define_minimum2( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Squared Difference Node and add it to a Subgraph. +/// +/// The Squared Difference Node computes elementwise squared difference of two tensor inputs with numpy broadcasting +/// rules. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the second +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param input2_id - Value ID for the second input tensor. 
The input tensor must be an M-dimensional tensor defined in +/// the @a subgraph with each dimension either equal to the corresponding dimension of the first +/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along +/// that dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the maximum between the corresponding dimension +/// of the two inputs. +/// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined. +enum xnn_status xnn_define_squared_difference( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Constant Pad Node with static padding specification and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array +/// must have as many elements as the number of dimensions in the input tensor. +/// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array +/// must have as many elements as the number of dimensions in the input tensor. +/// @param padding_value - constant value used to initialize padding elements. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor with padding. +/// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined. +enum xnn_status xnn_define_static_constant_pad( + xnn_subgraph_t subgraph, + const size_t* pre_paddings, + const size_t* post_paddings, + float padding_value, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Mean Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param num_reduction_axes - number of axes along which mean is computed. +/// @param reduction_axes - axes along which mean is computed. +/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with at least +/// @a num_reduction_axes dimensions defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor defined in the +/// @a subgraph with @a num_reduction_axes fewer dimensions than the input tensor (if +/// XNN_FLAG_REDUCE_DIMS is specified), or has same dimension rank but the dimension at +/// @a reduction_axes reduced to 1 (if XNN_FLAG_REDUCE_DIMS is not specified). +/// @param flags - binary features of the Mean Node. The only currently supported value is XNN_FLAG_REDUCE_DIMS +enum xnn_status xnn_define_static_mean( + xnn_subgraph_t subgraph, + size_t num_reduction_axes, + const size_t* reduction_axes, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Input Concatenate Node and add it to a Subgraph. +/// +/// The 2-Input Concatenate Node concatenates two tensors along a specified axis. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param axis - the axis to concatenate the two input tensors along +/// @param input1_id - Value ID for the first input tensor. 
The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// second input. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// first input. +/// @param output_id - Value ID for the output tensor. The output tensor must be a N-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis +/// dimension, where it is the sum of the corresponding dimensions of both inputs. +/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined. +enum xnn_status xnn_define_concatenate2( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 3-Input Concatenate Node and add it to a Subgraph. +/// +/// The 3-Input Concatenate Node concatenates three tensors along a specified axis. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param axis - the axis to concatenate the three input tensors along +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param output_id - Value ID for the output tensor. The output tensor must be a N-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis +/// dimension, where it is the sum of the corresponding dimensions of all inputs. +/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined. +enum xnn_status xnn_define_concatenate3( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t input3_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 4-Input Concatenate Node and add it to a Subgraph. +/// +/// The 4-Input Concatenate Node concatenates four tensors along a specified axis. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param axis - the axis to concatenate the four input tensors along +/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input3_id - Value ID for the third input tensor. 
The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param input4_id - Value ID for the fourth input tensor. The input tensor must be an N-dimensional tensor defined in +/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the +/// other inputs. +/// @param output_id - Value ID for the output tensor. The output tensor must be a N-dimensional tensor defined +/// in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis +/// dimension, where it is the sum of the corresponding dimensions of all inputs. +/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined. +enum xnn_status xnn_define_concatenate4( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t input3_id, + uint32_t input4_id, + uint32_t output_id, + uint32_t flags); + +enum xnn_status xnn_define_concatenate5( + xnn_subgraph_t subgraph, + size_t axis, + uint32_t input1_id, + uint32_t input2_id, + uint32_t input3_id, + uint32_t input4_id, + uint32_t input5_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Copy Node and add it to a Subgraph. +/// +/// The Copy Node copies an input tensor to an output tensor. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the first input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Copy Node. No supported flags are currently defined. +enum xnn_status xnn_define_copy( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a 2-Output Split Node and add it to a Subgraph. +/// +/// The 2-Output Split Node splits an input tensor into two output tensors along a specified axis evenly. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param split_dim - the dimension to split the input tensor along +/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a +/// subgraph. +/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined +/// in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension +/// of the second output. The split_dim dimension is half of the input's split_dim. +/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the first output. The split_dim dimension is half of the input's split_dim. +/// @param flags - binary features of the Split Node. No supported flags are currently defined. +enum xnn_status xnn_define_even_split2( + xnn_subgraph_t subgraph, + size_t split_dim, + uint32_t input_id, + uint32_t output1_id, + uint32_t output2_id, + uint32_t flags); + +/// Define a 3-Output Split Node and add it to a Subgraph. +/// +/// The 3-Output Split Node splits an input tensor into three output tensors along a specified axis evenly. 
+/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param split_dim - the dimension to split the input tensor along +/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a +/// subgraph. +/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined +/// in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension +/// of the second and third output. The split_dim dimension is one third of the input's split_dim. +/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the first and third output. The split_dim dimension is one third of the input's +/// split_dim. +/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the second and third output. The split_dim dimension is one third of the input's +/// split_dim. +/// @param flags - binary features of the Split Node. No supported flags are currently defined. +enum xnn_status xnn_define_even_split3( + xnn_subgraph_t subgraph, + size_t split_dim, + uint32_t input_id, + uint32_t output1_id, + uint32_t output2_id, + uint32_t output3_id, + uint32_t flags); + +/// Define a 4-Output Split Node and add it to a Subgraph. +/// +/// The 4-Output Split Node splits an input tensor into four output tensors along a specified axis evenly. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param split_dim - the dimension to split the input tensor along +/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a +/// subgraph. +/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined +/// in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension +/// of the other output tensors. The split_dim dimension is one fourth of the input's split_dim. +/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's +/// split_dim. +/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's +/// split_dim. +/// @param output4_id - Value ID for the fourth output tensor. The output tensor must be an N-dimensional tensor +/// defined in the @a subgraph with each dimension, except the axis, equal to the corresponding +/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's +/// split_dim. +/// @param flags - binary features of the Split Node. No supported flags are currently defined. 
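/*
 * Editor's illustration (not part of the original header): splitting a tensor
 * evenly into four outputs along one dimension with xnn_define_even_split4,
 * declared below. The value IDs are assumed to exist already; per the
 * documentation above, the input's split_dim extent is four times that of
 * each output.
 */
static enum xnn_status split_into_four(
    xnn_subgraph_t subgraph, size_t split_dim, uint32_t input_id,
    uint32_t out1_id, uint32_t out2_id, uint32_t out3_id, uint32_t out4_id) {
  return xnn_define_even_split4(
      subgraph, split_dim, input_id, out1_id, out2_id, out3_id, out4_id,
      /*flags=*/0);
}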
+enum xnn_status xnn_define_even_split4(
+  xnn_subgraph_t subgraph,
+  size_t split_dim,
+  uint32_t input_id,
+  uint32_t output1_id,
+  uint32_t output2_id,
+  uint32_t output3_id,
+  uint32_t output4_id,
+  uint32_t flags);
+
+/// Define a Reshape Node with static shape specification and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param num_dims - number of shape dimensions in the output tensor.
+/// @param new_shape - shape dimensions of the output tensor.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, its shape
+///                    must match @a new_shape, and it must contain the same number of elements as the input tensor.
+/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
+enum xnn_status xnn_define_static_reshape(
+  xnn_subgraph_t subgraph,
+  size_t num_dims,
+  const size_t* new_shape,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Node that reshapes a tensor to two dimensions, retaining the
+/// trailing dimension, and add it to a Subgraph.
+///
+/// This operator is experimental.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be
+///                   defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be
+///                    defined in the @a subgraph, and it must contain the same
+///                    number of elements as the input tensor.
+/// @param flags - binary features of the Reshape Node. No supported flags are
+///                currently defined.
+enum xnn_status xnn_define_reshape_2d(xnn_subgraph_t subgraph,
+                                      uint32_t input_id, uint32_t output_id,
+                                      uint32_t flags);
+
+/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param new_height - height dimension of the output tensor.
+/// @param new_width - width dimension of the output tensor.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, H, W, C] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, new_height, new_width, C] dimensions.
+/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
+///                XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
+enum xnn_status xnn_define_static_resize_bilinear_2d(
+  xnn_subgraph_t subgraph,
+  size_t new_height,
+  size_t new_width,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, H, W, channels] dimensions.
+/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph with
+///                   [channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, H, W, channels] dimensions.
+/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
+enum xnn_status xnn_define_prelu(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t slope_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a RoPE (Rotary Positional Embeddings) Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param max_sequence_size - maximum possible number of tokens (maximum sequence length) of the input/output tensors.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [batch, tokens, heads, channels] dimensions.
+/// @param weights_id - Value ID for the weights tensor. The weights tensor must be a 2D tensor defined in the
+///                     @a subgraph with [max_sequence_size, channels] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [batch, tokens, heads, channels] dimensions.
+/// @param flags - binary features of the RoPE Node. No supported flags are currently defined.
+enum xnn_status xnn_define_rope(
+  xnn_subgraph_t subgraph,
+  size_t max_sequence_size,
+  uint32_t input_id,
+  uint32_t weights_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define an Abs Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
+enum xnn_status xnn_define_abs(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Bankers' Rounding Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
+enum xnn_status xnn_define_bankers_rounding(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Batch Matrix Multiply Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
+///                    the @a subgraph. It must be at least 3D. The first N-2 dimensions must match the second input
+///                    tensor. The last 2 dimensions are [M, K]. If XNN_FLAG_TRANSPOSE_B is not specified, the last
+///                    dimension must match the second last dimension of the second input tensor. If
+///                    XNN_FLAG_TRANSPOSE_B is specified, the last dimension must match the last dimension of the
+///                    second input tensor.
+/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined
+///                    in the @a subgraph. It must be at least 3D. The first N-2 dimensions must match the first input tensor.
If XNN_FLAG_TRANSPOSE_B is not specified, the last 2 dimensions are [K, N], and the +/// second last dimension must match the last dimension of the first input tensor. If +/// XNN_FLAG_TRANSPOSE_B is specified, the last 2 dimensions are [N, K], and the last dimension must +/// match the last dimension of the first input tensor. +/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined in the +/// @a subgraph. It must be at least 3D. The first N-2 dimensions must match the first and second +/// input tensors . The last 2 dimensions must be [M, N]. +/// @param flags - binary features of the Batch Matrix Multiply Node. The only currently supported value is +/// XNN_FLAG_TRANSPOSE_B. +enum xnn_status xnn_define_batch_matrix_multiply( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Ceiling Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined. +enum xnn_status xnn_define_ceiling( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Clamp Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param output_min - lower bound for clipping output values. +/// @param output_max - upper bound for clipping output values. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Clamp Node. No supported flags are currently defined. +enum xnn_status xnn_define_clamp( + xnn_subgraph_t subgraph, + float output_min, + float output_max, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param alpha - scale factor for negative output elements. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the ELU Node. No supported flags are currently defined. +enum xnn_status xnn_define_elu( + xnn_subgraph_t subgraph, + float alpha, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Floor Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Floor Node. 
No supported flags are currently defined. +enum xnn_status xnn_define_floor( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a HardSwish Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined. +enum xnn_status xnn_define_hardswish( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Leaky ReLU Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param negative_slope - scale factor for negative input elements. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined. +enum xnn_status xnn_define_leaky_relu( + xnn_subgraph_t subgraph, + float negative_slope, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Negate Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Negate Node. No supported flags are currently defined. +enum xnn_status xnn_define_negate( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a Sigmoid Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined. +enum xnn_status xnn_define_sigmoid( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + +/// Define a SoftMax Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at +/// least one dimension. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined. 
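/*
 * Editor's illustration (not part of the original header): a SoftMax Node over
 * an activation tensor, using xnn_define_softmax declared below. Both value
 * IDs are assumed to be defined elsewhere in the subgraph, with the output
 * shape matching the input shape as documented above.
 */
static enum xnn_status add_softmax(
    xnn_subgraph_t subgraph, uint32_t logits_id, uint32_t probabilities_id) {
  return xnn_define_softmax(subgraph, logits_id, probabilities_id, /*flags=*/0);
}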
+enum xnn_status xnn_define_softmax(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Space To Depth 2D Node and add it to a Subgraph.
+///
+/// The Space To Depth 2D Node rearranges blocks of spatial data into blocks of channel data (the reverse transform to
+/// Depth To Space 2D). For each output pixel, values are gathered from the corresponding input square of pixels with
+/// side @a block_size across all input channels. The output depth is therefore @a block_size x @a block_size times
+/// greater than that of the input.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param block_size - the size of the spatial block.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
+///                   with [N, IH * block_size, IW * block_size, OC] dimensions.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
+///                    with [N, IH, IW, OC * block_size * block_size] dimensions.
+/// @param flags - binary features of the Space To Depth 2D Node. No supported flags are currently defined.
+enum xnn_status xnn_define_space_to_depth_2d(
+  xnn_subgraph_t subgraph,
+  uint32_t block_size,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
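+/// For example (editor's illustration, not part of the original header): with @a block_size = 2,
+/// a [1, 4, 6, 3] input (N=1, IH=2, IW=3, OC=3) produces a [1, 2, 3, 12] output; each 2x2 spatial
+/// block of the input is folded into the channels of a single output pixel.
+
+/// Define a Square Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Square Node. No supported flags are currently defined.
+enum xnn_status xnn_define_square(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Square Root Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
+enum xnn_status xnn_define_square_root(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Reciprocal Square Root Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be
+///                   defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be
+///                    defined in the @a subgraph, and its shape must match the
+///                    shape of the input tensor.
+/// @param flags - binary features of the Reciprocal Square Root Node. No supported flags
+///                are currently defined.
+enum xnn_status xnn_define_reciprocal_square_root(xnn_subgraph_t subgraph,
+                                                  uint32_t input_id,
+                                                  uint32_t output_id,
+                                                  uint32_t flags);
+
+/// Define a Static Slice Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param num_dims - number of shape dimensions in the input and output tensors.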
+/// @param offsets - offsets in each dimension of the input tensor. This array must have @a num_dims elements.
+/// @param sizes - size of each dimension in the output tensor. This array must have @a num_dims elements.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    dimensions must match @a sizes.
+/// @param flags - binary features of the Static Slice Node. No supported flags are currently defined.
+enum xnn_status xnn_define_static_slice(
+  xnn_subgraph_t subgraph,
+  size_t num_dims,
+  const size_t* offsets,
+  const size_t* sizes,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Define a Static Transpose Node and add it to a Subgraph.
+///
+/// The Static Transpose Node applies a generalized transpose to the input tensor using the permutation in @a perm.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in
+///                   the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
+///                    in the @a subgraph with each dimension equal to its corresponding permuted input dimension.
+/// @param num_dims - the number of permutation dimensions. This must be equal to the number of input dimensions.
+/// @param perm - the permutation of the axes of the input tensor. The perm array must contain 0 to N-1 in the
+///               permuted order.
+/// @param flags - binary features of the Static Transpose Node. No supported flags are currently defined.
+enum xnn_status xnn_define_static_transpose(
+  xnn_subgraph_t subgraph,
+  size_t num_dims,
+  const size_t* perm,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
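+/// For example (editor's illustration, not part of the original header): converting an NHWC
+/// tensor to NCHW with the Static Transpose Node defined above, assuming `subgraph`, `in_id`,
+/// and `out_id` were defined earlier:
+///
+///   const size_t perm[4] = {0, 3, 1, 2};  // out[n][c][h][w] = in[n][h][w][c]
+///   enum xnn_status status = xnn_define_static_transpose(
+///       subgraph, /*num_dims=*/4, perm, in_id, out_id, /*flags=*/0);
+///   assert(status == xnn_status_success);
+
+/// Define a Tanh Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
+///                    shape must match the shape of the input tensor.
+/// @param flags - binary features of the Tanh Node. No supported flags are currently defined.
+enum xnn_status xnn_define_tanh(
+  xnn_subgraph_t subgraph,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
+/// Code cache is a cache for JIT generated code.
+typedef struct xnn_code_cache* xnn_code_cache_t;
+
+/// Weights cache can be finalized in these ways:
+enum xnn_weights_cache_finalization_kind {
+  /// Weights cache is finalized; no insert operations into the weights cache are allowed, even if the "inserted"
+  /// weights already exist in the cache. Weights cache memory will also be trimmed to a page boundary and set to
+  /// read-only (to prevent writes).
+  xnn_weights_cache_finalization_kind_hard,
+  /// Weights cache will be finalized with some extra space at the end; this allows "inserting" into the cache only
+  /// if the weights are already in the cache, and errors on inserting uncached weights. There is memory overhead.
+  xnn_weights_cache_finalization_kind_soft,
+};
+
+/// A combination of multiple factors to uniquely locate an entry in the weights cache.
+struct xnn_weights_cache_look_up_key {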
+  /// The unique seed for each ukernel. It is guaranteed that each ukernel provides
+  /// a consistent and identical seed.
+  uint32_t seed;
+  /// Pointer to the original kernel.
+  const void* kernel;
+  /// Pointer to the original bias; may be NULL.
+  const void* bias;
+};
+
+/// A group of function pointers to manage the weights cache. All functions may be
+/// called from multiple threads.
+struct xnn_weights_cache_provider {
+  /// User-specified pointer that will be passed as-is to all functions in this
+  /// structure.
+  void* context;
+
+  /// Looks up the tuple of {cache_key, kernel, bias} in the cache. If it is found,
+  /// returns the offset to the found entry for reuse. Otherwise, returns SIZE_MAX.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param cache_key - The key used to locate the weights cache entry.
+  size_t (*look_up)(void* context, const struct xnn_weights_cache_look_up_key* cache_key);
+
+  /// Ensures that the cache has enough space for `n` bytes. Returns the address at
+  /// which to store the packed weights, or NULL if it fails to reserve space.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param n - size to be reserved.
+  void* (*reserve_space)(void* context, size_t n);
+
+  /// Looks up the packed weights at `ptr` in the cache. If they are found, reuses them.
+  /// Otherwise, they are added to the cache. Returns the offset into the cache.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param cache_key - The key used to locate the weights cache entry.
+  /// @param ptr - pointer to the packed weights.
+  /// @param size - size of the packed weights.
+  size_t (*look_up_or_insert)(void* context, const struct xnn_weights_cache_look_up_key* cache_key, void* ptr, size_t size);
+
+  /// Returns whether the cache is finalized.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  bool (*is_finalized)(void* context);
+
+  /// Returns the absolute pointer corresponding to `offset`, where the offset was returned by
+  /// `look_up` or `look_up_or_insert`. This function must be called after finalization.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  /// @param offset - offset from the start of the internal buffer.
+  void* (*offset_to_addr)(void* context, size_t offset);
+
+  /// Destroys the weights cache object, as well as memory used for the cache.
+  /// @param context - The user-specified pointer from the xnn_weights_cache_provider structure.
+  enum xnn_status (*delete_cache)(void* context);
+};
+
+/// Weights cache is a cache for packed weights. It can be reused between runtimes.
+typedef struct xnn_weights_cache_provider* xnn_weights_cache_t;
+
+/// Create a weights cache object specifying the initial size of the weights cache (in bytes).
+///
+/// @param[in] size - initial capacity of the weights cache (in bytes), i.e. it can hold @a size bytes without growing.
+/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache provider
+///                            upon successful return. Once created, the weights cache provider can be shared between
+///                            different Runtime objects.
+enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out);
+
+enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out);
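+/// For example (editor's sketch, not part of the original header): sharing packed weights
+/// between two runtimes built from the same subgraph, assuming `subgraph` already exists
+/// (xnn_create_runtime_v4, xnn_finalize_weights_cache, and xnn_delete_weights_cache are
+/// declared below):
+///
+///   xnn_weights_cache_t cache = NULL;
+///   xnn_create_weights_cache_with_size(/*size=*/1024 * 1024, &cache);
+///   xnn_runtime_t rt1 = NULL, rt2 = NULL;
+///   xnn_create_runtime_v4(subgraph, cache, /*workspace=*/NULL, /*threadpool=*/NULL, /*flags=*/0, &rt1);
+///   xnn_finalize_weights_cache(cache, xnn_weights_cache_finalization_kind_soft);
+///   // Soft finalization still allows later runtimes to "insert" weights that are
+///   // already cached; hard finalization would also make the memory read-only.
+///   xnn_create_runtime_v4(subgraph, cache, NULL, NULL, 0, &rt2);
+///   // ... use the runtimes, delete them, then:
+///   xnn_delete_weights_cache(cache);
+
+/// Finalizes the weights cache. The kind of finalization is specified by `finalization_kind`.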
+/// @param weights_cache - the weights cache object to finalize.
+/// @param finalization_kind - the kind of finalization.
+enum xnn_status xnn_finalize_weights_cache(
+  xnn_weights_cache_t weights_cache,
+  enum xnn_weights_cache_finalization_kind finalization_kind);
+
+/// Destroy a weights cache object, as well as memory used for the cache.
+/// @param weights_cache - the weights cache object to destroy.
+enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache);
+
+typedef struct xnn_workspace* xnn_workspace_t;
+
+/// Create a workspace object.
+/// @param workspace_out - pointer to the variable that will be initialized to a handle to the workspace object upon
+///                        successful return. Once created, the workspace can be shared between different Runtime
+///                        objects.
+enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out);
+/// Destroy a workspace object, as well as memory used by the workspace. Object destruction can be deferred until all
+/// Runtime objects created with this workspace are destroyed.
+/// @param workspace - the workspace object to destroy.
+enum xnn_status xnn_release_workspace(xnn_workspace_t workspace);
+
+/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
+typedef struct xnn_runtime* xnn_runtime_t;
+
+enum xnn_profile_info {
+  /// Returns a size_t containing the number of operators.
+  xnn_profile_info_num_operators,
+  /// Returns a char[] containing the null-character-separated names of all operators.
+  xnn_profile_info_operator_name,
+  /// Returns a uint64_t[] with the runtimes of all operators in the same order as xnn_profile_info_operator_name.
+  xnn_profile_info_operator_timing,
+};
+
+/// Return profile information for all operators.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2 or
+///                  @ref xnn_create_runtime_v3.
+/// @param param_name - type of profile information required.
+/// @param param_value_size - the size in bytes of memory pointed to by param_value. If this is not sufficient, then
+///                           param_value_size_ret will be set to the required size and xnn_status_out_of_memory will be
+///                           returned.
+/// @param param_value - a pointer to the memory location where the values for the given param_name will be written.
+/// @param param_value_size_ret - returns the number of bytes required to write the result if param_value_size is not
+///                               sufficient.
+enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
+                                               enum xnn_profile_info param_name,
+                                               size_t param_value_size,
+                                               void* param_value,
+                                               size_t* param_value_size_ret);
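+/// For example (editor's sketch, not part of the original header): the two-call size-query
+/// pattern described above, used to retrieve the operator names:
+///
+///   size_t size = 0;
+///   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_name,
+///                                  0, NULL, &size);  // returns xnn_status_out_of_memory
+///   char* names = malloc(size);
+///   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_name,
+///                                  size, names, &size);
+
+/// Create a Runtime object from a subgraph.
+///
+/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
+///                   Nodes can be added to the runtime once it is constructed.
+/// @param weights_cache - a cache for packed weights. The runtime will look up and reuse packed weights in this cache,
+///                        which reduces the memory allocated for packed weights.
+/// @param workspace - a workspace to hold internal tensors. The runtime will allocate space used for internal tensors
+///                    and track them using the workspace. A workspace can be shared and reused across different
+///                    runtimes. If workspace is NULL, there will be no sharing: each runtime has its own workspace.
+/// @param threadpool - the thread pool to be used for parallelization of computations in the runtime. If the thread
+///                     pool is NULL, the computation would run on the caller thread without parallelization.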
+/// @param flags - binary features of the runtime. The only currently supported values are
+///                XNN_FLAG_HINT_SPARSE_INFERENCE, XNN_FLAG_HINT_FP16_INFERENCE, XNN_FLAG_FORCE_FP16_INFERENCE,
+///                XNN_FLAG_YIELD_WORKERS, and XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER. If XNN_FLAG_YIELD_WORKERS is
+///                specified, worker threads would be yielded to the system scheduler after processing the last operator
+///                in the Runtime. If XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER is specified, convolution operators will
+///                initialize indirection buffers on each inference run using temporary memory in the workspace, instead
+///                of initializing persistent indirection buffers once.
+/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
+///                      successful return. Once constructed, the Runtime object is independent of the Subgraph object
+///                      used to create it.
+enum xnn_status xnn_create_runtime_v4(
+  xnn_subgraph_t subgraph,
+  xnn_weights_cache_t weights_cache,
+  xnn_workspace_t workspace,
+  pthreadpool_t threadpool,
+  uint32_t flags,
+  xnn_runtime_t* runtime_out);
+
+enum xnn_status xnn_create_runtime_v3(
+  xnn_subgraph_t subgraph,
+  xnn_weights_cache_t weights_cache,
+  pthreadpool_t threadpool,
+  uint32_t flags,
+  xnn_runtime_t* runtime_out);
+
+enum xnn_status xnn_create_runtime_v2(
+  xnn_subgraph_t subgraph,
+  pthreadpool_t threadpool,
+  uint32_t flags,
+  xnn_runtime_t* runtime_out);
+
+enum xnn_status xnn_create_runtime(
+  xnn_subgraph_t subgraph,
+  xnn_runtime_t* runtime_out);
+
+struct xnn_external_value {
+  uint32_t id;
+  void* data;
+};
+
+/// Reshape an external value.
+///
+/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
+///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
+///                      created for the Value.
+/// @param num_dims - number of dimensions in the shape.
+/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
+///               XNNPACK does not keep any pointers to this array after the function returns.
+enum xnn_status xnn_reshape_external_value(
+  xnn_runtime_t runtime,
+  uint32_t external_id,
+  size_t num_dims,
+  const size_t* dims);
+
+/// Get the external value shape.
+///
+/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
+///                      the Subgraph creation. The external ID cannot be XNN_INVALID_VALUE_ID.
+/// @param num_dims - a valid pointer into which the number of dimensions in the shape will be written. It cannot be
+///                   larger than XNN_MAX_TENSOR_DIMS.
+/// @param dims - pointer to an array of @a num_dims shape dimensions. This pointer cannot be NULL. It must be large
+///               enough to hold at least @a num_dims elements. XNNPACK does not keep any pointers to this array after
+///               the function returns.
+enum xnn_status xnn_get_external_value_shape(
+  xnn_runtime_t runtime,
+  uint32_t external_id,
+  size_t* num_dims,
+  size_t* dims);
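+/// For example (editor's sketch, not part of the original header): a typical dynamic-shape
+/// inference sequence, assuming `rt` was created with xnn_create_runtime_v4 above and
+/// `in_id`/`out_id` are external Value IDs (xnn_reshape_runtime, xnn_setup_runtime_v2, and
+/// xnn_invoke_runtime are declared below):
+///
+///   const size_t in_dims[2] = {batch_size, channels};
+///   xnn_reshape_external_value(rt, in_id, 2, in_dims);
+///   xnn_reshape_runtime(rt);  // propagates shapes and allocates memory
+///   size_t out_rank = 0, out_dims[XNN_MAX_TENSOR_DIMS];
+///   xnn_get_external_value_shape(rt, out_id, &out_rank, out_dims);
+///   struct xnn_external_value io[2] = {{in_id, input}, {out_id, output}};
+///   xnn_setup_runtime_v2(rt, 2, io);
+///   xnn_invoke_runtime(rt);
+
+/// Reshape the XNNPACK runtime.
+///
+/// Propagates the shapes of input tensors through the graph to determine the shapes of intermediate and output
+/// tensors. Memory is allocated if required. Output tensor shapes are returned by xnn_get_external_value_shape.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.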
+enum xnn_status xnn_reshape_runtime(
+  xnn_runtime_t runtime);
+
+/// Deprecated. Use xnn_reshape_runtime and xnn_setup_runtime_v2.
+///
+/// Setup data pointers for external inputs and outputs in a Runtime object and
+/// allocate memory.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.
+/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
+///                              match the number of external inputs and outputs in the runtime, i.e. all external
+///                              inputs and outputs in the runtime must be specified in one call.
+/// @param external_values - array with location information for all external inputs and outputs in the runtime.
+enum xnn_status xnn_setup_runtime(
+  xnn_runtime_t runtime,
+  size_t num_external_values,
+  const struct xnn_external_value* external_values);
+
+/// Setup data pointers for external inputs and outputs in a Runtime object.
+/// Should be called after xnn_reshape_runtime.
+///
+/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.
+/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
+///                              match the number of external inputs and outputs in the runtime, i.e. all external
+///                              inputs and outputs in the runtime must be specified in one call.
+/// @param external_values - array with location information for all external inputs and outputs in the runtime.
+enum xnn_status xnn_setup_runtime_v2(
+  xnn_runtime_t runtime,
+  size_t num_external_values,
+  const struct xnn_external_value* external_values);
+
+/// Execute forward pass for all operators in the runtime.
+///
+/// @param runtime - the Runtime object with the execution plan to invoke.
+enum xnn_status xnn_invoke_runtime(
+  xnn_runtime_t runtime);
+
+/// Destroy a Runtime object, as well as operators and memory associated with it.
+///
+/// @param runtime - the Runtime object to destroy.
+enum xnn_status xnn_delete_runtime(
+  xnn_runtime_t runtime);
+
+typedef struct xnn_operator* xnn_operator_t;
+
+enum xnn_status xnn_run_operator(
+  xnn_operator_t op,
+  pthreadpool_t threadpool);
+
+enum xnn_status xnn_delete_operator(
+  xnn_operator_t op);
+
+
+/// Operator API:
+/// - create operator will create and populate an xnn_operator_t
+/// - reshape operator will update fields in the xnn_operator_t with shape/dimensions and parallelization information
+/// - setup operator will update pointers to inputs and outputs
+/// Each supported operator must have a create, reshape, and setup function. (Optionally a run function.)
+/// Operators listed below are in alphabetical order by operator name; within each operator, we sort alphabetically by
+/// data layout and type. We also group the create, reshape, setup (and optionally run) functions of each operator
+/// together.
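+/// For example (editor's sketch, not part of the original header): the full lifecycle for the
+/// Abs operator declared below, assuming XNNPACK was already initialized with xnn_initialize
+/// and `input`/`output` point to batch_size * channels floats:
+///
+///   xnn_operator_t abs_op = NULL;
+///   xnn_create_abs_nc_f32(/*flags=*/0, &abs_op);                    // create
+///   xnn_reshape_abs_nc_f32(abs_op, batch_size, channels,
+///                          /*input_stride=*/channels,
+///                          /*output_stride=*/channels,
+///                          /*threadpool=*/NULL);                    // reshape
+///   xnn_setup_abs_nc_f32(abs_op, input, output);                    // setup
+///   xnn_run_operator(abs_op, /*threadpool=*/NULL);                  // execute
+///   xnn_delete_operator(abs_op);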
+ +enum xnn_status xnn_create_abs_nc_f16( + uint32_t flags, + xnn_operator_t* abs_op_out); + +enum xnn_status xnn_reshape_abs_nc_f16( + xnn_operator_t abs_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_abs_nc_f16( + xnn_operator_t abs_op, + const void* input, + void* output); + +enum xnn_status xnn_create_abs_nc_f32( + uint32_t flags, + xnn_operator_t* abs_op_out); + +enum xnn_status xnn_reshape_abs_nc_f32( + xnn_operator_t abs_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_abs_nc_f32( + xnn_operator_t abs_op, + const float* input, + float* output); + +enum xnn_status xnn_run_abs_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_add_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_f16( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_f16( + xnn_operator_t add_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_add_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_f32( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_f32( + xnn_operator_t add_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_add_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_add_nd_qs8( + int8_t input1_zero_point, + float input1_scale, + int8_t input2_zero_point, + float input2_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_qs8( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_qs8( + xnn_operator_t add_op, + const int8_t* input1, + const int8_t* input2, + int8_t* output); + +enum xnn_status xnn_run_add_nd_qs8( + size_t num_input1_dims, + const size_t* input1_shape, + int8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + int8_t input2_zero_point, + float input2_scale, + const int8_t* input1, + const int8_t* input2, + int8_t* output, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_add_nd_qu8( + uint8_t input1_zero_point, + float input1_scale, + uint8_t input2_zero_point, + float input2_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + 
xnn_operator_t* add_op_out); + +enum xnn_status xnn_reshape_add_nd_qu8( + xnn_operator_t add_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_add_nd_qu8( + xnn_operator_t add_op, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output); + +enum xnn_status xnn_run_add_nd_qu8( + size_t num_input1_dims, + const size_t* input1_shape, + uint8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + uint8_t input2_zero_point, + float input2_scale, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t flags, + xnn_operator_t* argmax_pooling_op_out); + +enum xnn_status xnn_reshape_argmax_pooling2d_nhwc_f32( + xnn_operator_t argmax_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32( + xnn_operator_t argmax_pooling_op, + void* workspace, + const float* input, + float* output, + uint32_t* index); + +enum xnn_status xnn_create_average_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* average_pooling_op_out); + +enum xnn_status xnn_reshape_average_pooling2d_nhwc_f16( + xnn_operator_t average_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_average_pooling2d_nhwc_f16( + xnn_operator_t average_pooling_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_average_pooling2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* average_pooling_op_out); + +enum xnn_status xnn_reshape_average_pooling2d_nhwc_f32( + xnn_operator_t average_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_average_pooling2d_nhwc_f32( + xnn_operator_t average_pooling_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status 
xnn_create_average_pooling2d_nhwc_qu8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* average_pooling_op_out); + +enum xnn_status xnn_reshape_average_pooling2d_nhwc_qu8( + xnn_operator_t average_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8( + xnn_operator_t average_pooling_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_bankers_rounding_nc_f16( + uint32_t flags, + xnn_operator_t* rounding_op_out); + +enum xnn_status xnn_reshape_bankers_rounding_nc_f16( + xnn_operator_t rounding_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_bankers_rounding_nc_f16( + xnn_operator_t rounding_op, + const void* input, + void* output); + +enum xnn_status xnn_create_bankers_rounding_nc_f32( + uint32_t flags, + xnn_operator_t* rounding_op_out); + +enum xnn_status xnn_reshape_bankers_rounding_nc_f32( + xnn_operator_t rounding_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_bankers_rounding_nc_f32( + xnn_operator_t rounding_op, + const float* input, + float* output); + +enum xnn_status xnn_run_bankers_rounding_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_batch_matrix_multiply_nc_f16( + uint32_t flags, + xnn_operator_t* batch_matrix_multiply_op); + +enum xnn_status xnn_reshape_batch_matrix_multiply_nc_f16( + xnn_operator_t batch_matrix_multiply_op, + size_t batch_size, + size_t m, + size_t k, + size_t n, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_batch_matrix_multiply_nc_f16( + xnn_operator_t batch_matrix_multiply_op, + void* workspace, + const void* lhs_input, + const void* rhs_input, + void* output); + +enum xnn_status xnn_create_batch_matrix_multiply_nc_f32( + uint32_t flags, + xnn_operator_t* batch_matrix_multiply_op); + +enum xnn_status xnn_reshape_batch_matrix_multiply_nc_f32( + xnn_operator_t batch_matrix_multiply_op, + size_t batch_size, + size_t m, + size_t k, + size_t n, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_batch_matrix_multiply_nc_f32( + xnn_operator_t batch_matrix_multiply_op, + void* workspace, + const float* lhs_input, + const float* rhs_input, + float* output); + +enum xnn_status xnn_create_ceiling_nc_f16( + uint32_t flags, + xnn_operator_t* ceiling_op_out); + +enum xnn_status xnn_reshape_ceiling_nc_f16( + xnn_operator_t ceiling_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_ceiling_nc_f16( + xnn_operator_t ceiling_op, + const void* input, + void* output); + +enum xnn_status xnn_create_ceiling_nc_f32( + uint32_t flags, + xnn_operator_t* ceiling_op_out); + +enum xnn_status xnn_run_ceiling_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_ceiling_nc_f32( + xnn_operator_t ceiling_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_ceiling_nc_f32( + xnn_operator_t ceiling_op, + const float* input, + float* output); + +enum xnn_status xnn_create_channel_shuffle_nc_x8( + size_t groups, + size_t group_channels, + size_t input_stride, + size_t output_stride, + uint32_t flags, + xnn_operator_t* channel_shuffle_op_out); + +enum xnn_status xnn_reshape_channel_shuffle_nc_x8( + xnn_operator_t channel_shuffle_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_channel_shuffle_nc_x8( + xnn_operator_t channel_shuffle_op, + const void* input, + void* output); + +enum xnn_status xnn_create_channel_shuffle_nc_x32( + size_t groups, + size_t group_channels, + size_t input_stride, + size_t output_stride, + uint32_t flags, + xnn_operator_t* channel_shuffle_op_out); + +enum xnn_status xnn_reshape_channel_shuffle_nc_x32( + xnn_operator_t channel_shuffle_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_channel_shuffle_nc_x32( + xnn_operator_t channel_shuffle_op, + const void* input, + void* output); + +enum xnn_status xnn_create_clamp_nc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_f16( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_clamp_nc_f16( + xnn_operator_t clamp_op, + const void* input, + void* output); + +enum xnn_status xnn_create_clamp_nc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_f32( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_clamp_nc_f32( + xnn_operator_t clamp_op, + const float* input, + float* output); + +enum xnn_status xnn_run_clamp_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_clamp_nc_s8( + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_s8( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_clamp_nc_s8( + xnn_operator_t clamp_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_clamp_nc_u8( + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* clamp_op_out); + +enum xnn_status xnn_reshape_clamp_nc_u8( + xnn_operator_t clamp_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_clamp_nc_u8( + xnn_operator_t clamp_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_constant_pad_nd_x8( + const void* padding_value, + uint32_t flags, + xnn_operator_t* constant_pad_op_out); + +enum xnn_status xnn_reshape_constant_pad_nd_x8( + xnn_operator_t constant_pad_op, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_padding, + const size_t* post_padding, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_constant_pad_nd_x8( + xnn_operator_t constant_pad_op, + const void* input, + void* output); + +enum xnn_status xnn_run_constant_pad_nd_x8( + uint32_t flags, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_paddings, + const size_t* post_paddings, + const void* input, + void* output, + const void* padding_value, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_constant_pad_nd_x16( + const void* padding_value, + uint32_t flags, + xnn_operator_t* constant_pad_op_out); + +enum xnn_status xnn_reshape_constant_pad_nd_x16( + xnn_operator_t constant_pad_op, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_padding, + const size_t* post_padding, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_constant_pad_nd_x16( + xnn_operator_t constant_pad_op, + const void* input, + void* output); + +enum xnn_status xnn_run_constant_pad_nd_x16( + uint32_t flags, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_paddings, + const size_t* post_paddings, + const void* input, + void* output, + const void* padding_value, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_constant_pad_nd_x32( + const void* padding_value, + uint32_t flags, + xnn_operator_t* constant_pad_op_out); + +enum xnn_status xnn_reshape_constant_pad_nd_x32( + xnn_operator_t constant_pad_op, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_padding, + const size_t* post_padding, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_constant_pad_nd_x32( + xnn_operator_t constant_pad_op, + const void* input, + void* output); + +enum xnn_status xnn_run_constant_pad_nd_x32( + uint32_t flags, + size_t num_dims, + const size_t* input_shape, + const size_t* pre_paddings, + const size_t* post_paddings, + const void* input, + void* output, + const void* padding_value, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f16_f32( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f16_f32( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f16_f32( + xnn_operator_t convert_op, + const void* input, + float* output); + +enum xnn_status xnn_run_convert_nc_f16_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const void* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f16_qd8( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f16_qd8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +// quantization_params must be padded with at least XNN_EXTRA_QUANTIZATION_PARAMS entries. 
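+// For example (editor's sketch, not part of the original header), for a reshape with
+// `batch_size` rows one might allocate:
+//
+//   struct xnn_dynamic_quantization_params* quantization_params =
+//       malloc((batch_size + XNN_EXTRA_QUANTIZATION_PARAMS) *
+//              sizeof(struct xnn_dynamic_quantization_params));
+//
+// and pass it to the setup call below.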
+enum xnn_status xnn_setup_convert_nc_f16_qd8( + xnn_operator_t convert_op, + const void* input, + int8_t* output, + struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_create_convert_nc_f32_qd8( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_qd8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +// quantization_params must be padded with at least XNN_EXTRA_QUANTIZATION_PARAMS entries. +enum xnn_status xnn_setup_convert_nc_f32_qd8( + xnn_operator_t convert_op, + const float* input, + int8_t* output, + struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_create_convert_nc_f32_f16( + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_f16( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f32_f16( + xnn_operator_t convert_op, + const float* input, + void* output); + +enum xnn_status xnn_run_convert_nc_f32_f16( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + void* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f32_qs8( + float output_scale, + int8_t output_zero_point, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_qs8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f32_qs8( + xnn_operator_t convert_op, + const float* input, + int8_t* output); + +enum xnn_status xnn_run_convert_nc_f32_qs8( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + int8_t* output, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_f32_qu8( + float output_scale, + uint8_t output_zero_point, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_f32_qu8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_f32_qu8( + xnn_operator_t convert_op, + const float* input, + uint8_t* output); + +enum xnn_status xnn_run_convert_nc_f32_qu8( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + uint8_t* output, + float output_scale, + uint8_t output_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_qs8( + float input_scale, + int8_t input_zero_point, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs8( + xnn_operator_t convert_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_convert_nc_qs8_f16( + float input_scale, + int8_t 
input_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs8_f16( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs8_f16( + xnn_operator_t convert_op, + const int8_t* input, + void* output); + +enum xnn_status xnn_create_convert_nc_qs8_f32( + float input_scale, + int8_t input_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs8_f32( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs8_f32( + xnn_operator_t convert_op, + const int8_t* input, + float* output); + +enum xnn_status xnn_run_convert_nc_qs8_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const int8_t* input, + float* output, + float input_scale, + int8_t input_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_qs16_qs8( + float input_scale, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qs16_qs8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qs16_qs8( + xnn_operator_t convert_op, + const int16_t* input, + int8_t* output); + +enum xnn_status xnn_run_convert_nc_qs16_qs8( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const int16_t* input, + int8_t* output, + float input_scale, + float output_scale, + int8_t output_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convert_nc_qu8( + float input_scale, + uint8_t input_zero_point, + float output_scale, + uint8_t output_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qu8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qu8( + xnn_operator_t convert_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_convert_nc_qu8_f32( + float input_scale, + uint8_t input_zero_point, + uint32_t flags, + xnn_operator_t* convert_op_out); + +enum xnn_status xnn_reshape_convert_nc_qu8_f32( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convert_nc_qu8_f32( + xnn_operator_t convert_op, + const uint8_t* input, + float* output); + +enum xnn_status xnn_run_convert_nc_qu8_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const uint8_t* input, + float* output, + float input_scale, + uint8_t input_zero_point, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_convolution2d_nchw_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, 
+ size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nchw_f16( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nchw_f16( + xnn_operator_t convolution_op, + const void* input, + void* output); + +enum xnn_status xnn_create_convolution2d_nchw_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nchw_f32( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nchw_f32( + xnn_operator_t convolution_op, + const float* input, + float* output); + +enum xnn_status xnn_create_convolution2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_f16( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_f16( + xnn_operator_t convolution_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_convolution2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* 
convolution_op_out); + +// Forward declare. +struct xnn_post_operation; + +/// Create a convolution operator with a number of post operations. The +/// convolution operator created using this function does not have output_min +/// and output_max. The list of operators in post_operations will be applied in +/// order. Convolution with post operations is only supported on JIT platforms +/// and when JIT is enabled. +enum xnn_status xnn_create_fused_convolution2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + size_t num_post_operations, + struct xnn_post_operation* post_operations, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_f32( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_f32( + xnn_operator_t convolution_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_convolution2d_nhwc_qd8_f16_qc8w( + uint32_t input_padding_top, uint32_t input_padding_right, + uint32_t input_padding_bottom, uint32_t input_padding_left, + uint32_t kernel_height, uint32_t kernel_width, uint32_t subsampling_height, + uint32_t subsampling_width, uint32_t dilation_height, + uint32_t dilation_width, uint32_t groups, size_t group_input_channels, + size_t group_output_channels, size_t input_channel_stride, + size_t output_channel_stride, const float* kernel_scale, + const int8_t* kernel, const float* bias, float output_min, float output_max, + uint32_t flags, xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_create_convolution2d_nhwc_qd8_f32_qc8w( + uint32_t input_padding_top, uint32_t input_padding_right, + uint32_t input_padding_bottom, uint32_t input_padding_left, + uint32_t kernel_height, uint32_t kernel_width, uint32_t subsampling_height, + uint32_t subsampling_width, uint32_t dilation_height, + uint32_t dilation_width, uint32_t groups, size_t group_input_channels, + size_t group_output_channels, size_t input_channel_stride, + size_t output_channel_stride, const float* kernel_scale, + const int8_t* kernel, const float* bias, float output_min, float output_max, + uint32_t flags, xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_create_convolution2d_nhwc_qs8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + int8_t input_zero_point, + float 
input_scale, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qd8_f16_qc8w( + xnn_operator_t convolution_op, size_t batch_size, size_t input_height, + size_t input_width, size_t* workspace_size, size_t* workspace_alignment, + size_t* output_height_out, size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qd8_f32_qc8w( + xnn_operator_t convolution_op, size_t batch_size, size_t input_height, + size_t input_width, size_t* workspace_size, size_t* workspace_alignment, + size_t* output_height_out, size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qs8( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f16_qc8w( + xnn_operator_t convolution_op, void* workspace, const int8_t* input, + void* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f32_qc8w( + xnn_operator_t convolution_op, void* workspace, const int8_t* input, + float* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_setup_convolution2d_nhwc_qs8( + xnn_operator_t convolution_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_convolution2d_nhwc_qs8_qc8w( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + int8_t input_zero_point, + float input_scale, + const float* kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qs8_qc8w( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_qs8_qc8w( + xnn_operator_t convolution_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_convolution2d_nhwc_qu8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t 
output_channel_stride, + uint8_t input_zero_point, + float input_scale, + uint8_t kernel_zero_point, + float kernel_scale, + const uint8_t* kernel, + const int32_t* bias, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_qu8( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* workspace_alignment, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_qu8( + xnn_operator_t convolution_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_copy_nc_x8( + uint32_t flags, + xnn_operator_t* copy_op_out); + +enum xnn_status xnn_reshape_copy_nc_x8( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_copy_nc_x8( + xnn_operator_t copy_op, + const void* input, + void* output); + +enum xnn_status xnn_create_copy_nc_x16( + uint32_t flags, + xnn_operator_t* copy_op_out); + +enum xnn_status xnn_reshape_copy_nc_x16( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_copy_nc_x16( + xnn_operator_t copy_op, + const void* input, + void* output); + +enum xnn_status xnn_create_copy_nc_x32( + uint32_t flags, + xnn_operator_t* copy_op_out); + +enum xnn_status xnn_reshape_copy_nc_x32( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_copy_nc_x32( + xnn_operator_t copy_op, + const void* input, + void* output); + +enum xnn_status xnn_run_copy_nc_x32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const uint32_t* input, + uint32_t* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_deconvolution2d_nhwc_f16( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_f16( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_f16( + xnn_operator_t deconvolution_op, + const void* input, + void* output); + +enum xnn_status xnn_create_deconvolution2d_nhwc_f32( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t 
kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_f32( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_f32( + xnn_operator_t deconvolution_op, + const float* input, + float* output); + +enum xnn_status xnn_create_deconvolution2d_nhwc_qs8( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + int8_t input_zero_point, + float input_scale, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_qs8( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8( + xnn_operator_t deconvolution_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_deconvolution2d_nhwc_qu8( + uint32_t output_padding_top, + uint32_t output_padding_right, + uint32_t output_padding_bottom, + uint32_t output_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + uint8_t input_zero_point, + float input_scale, + uint8_t kernel_zero_point, + float kernel_scale, + const uint8_t* kernel, + const int32_t* bias, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* deconvolution_op_out); + +enum xnn_status xnn_reshape_deconvolution2d_nhwc_qu8( + xnn_operator_t deconvolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + uint32_t adjustment_height, + uint32_t adjustment_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8( + xnn_operator_t deconvolution_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x16( + uint32_t block_size, + uint32_t flags, + 
xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nchw2nhwc_x16( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x16( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nchw2nhwc_x32( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nhwc_x8( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nhwc_x8( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nhwc_x8( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nhwc_x16( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nhwc_x16( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nhwc_x16( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_depth_to_space_nhwc_x32( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* depth_to_space_op_out); + +enum xnn_status xnn_reshape_depth_to_space_nhwc_x32( + xnn_operator_t depth_to_space_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_depth_to_space_nhwc_x32( + xnn_operator_t depth_to_space_op, + const void* input, + void* output); + +enum xnn_status xnn_create_divide_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* divide_op_out); + +enum xnn_status xnn_reshape_divide_nd_f16( + xnn_operator_t divide_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_divide_nd_f16( + xnn_operator_t divide_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_divide_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* divide_op_out); + +enum xnn_status xnn_reshape_divide_nd_f32( + xnn_operator_t divide_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* 
input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_divide_nd_f32( + xnn_operator_t divide_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_divide_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_dynamic_fully_connected_nc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* dynamic_fully_connected_op_out); + +enum xnn_status xnn_reshape_dynamic_fully_connected_nc_f16( + xnn_operator_t dynamic_fully_connected_op, + size_t batch_size, + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_dynamic_fully_connected_nc_f16( + xnn_operator_t dynamic_fully_connected_op, + void* workspace, + const void* input, + const void* kernel, + const void* bias, + void* output); + +enum xnn_status xnn_create_dynamic_fully_connected_nc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* dynamic_fully_connected_op_out); + +enum xnn_status xnn_reshape_dynamic_fully_connected_nc_f32( + xnn_operator_t dynamic_fully_connected_op, + size_t batch_size, + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_dynamic_fully_connected_nc_f32( + xnn_operator_t dynamic_fully_connected_op, + void* workspace, + const float* input, + const float* kernel, + const float* bias, + float* output); + +enum xnn_status xnn_create_elu_nc_f16( + float alpha, + uint32_t flags, + xnn_operator_t* elu_op_out); + +enum xnn_status xnn_reshape_elu_nc_f16( + xnn_operator_t elu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_elu_nc_f16( + xnn_operator_t elu_op, + const void* input, + void* output); + +enum xnn_status xnn_create_elu_nc_f32( + float alpha, + uint32_t flags, + xnn_operator_t* elu_op_out); + +enum xnn_status xnn_reshape_elu_nc_f32( + xnn_operator_t elu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_elu_nc_f32( + xnn_operator_t elu_op, + const float* input, + float* output); + +enum xnn_status xnn_run_elu_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + float alpha, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_elu_nc_qs8( + float alpha, + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* elu_op_out); + +enum xnn_status xnn_reshape_elu_nc_qs8( + xnn_operator_t elu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_elu_nc_qs8( + xnn_operator_t elu_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_floor_nc_f16( + uint32_t flags, + xnn_operator_t* floor_op_out); + +enum xnn_status 
xnn_reshape_floor_nc_f16( + xnn_operator_t floor_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_floor_nc_f16( + xnn_operator_t floor_op, + const void* input, + void* output); + +enum xnn_status xnn_create_floor_nc_f32( + uint32_t flags, + xnn_operator_t* floor_op_out); + +enum xnn_status xnn_reshape_floor_nc_f32( + xnn_operator_t floor_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_floor_nc_f32( + xnn_operator_t floor_op, + const float* input, + float* output); + +enum xnn_status xnn_run_floor_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_f16( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f16( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f16( + xnn_operator_t fully_connected_op, + const void* input, + void* output); + +enum xnn_status xnn_create_fully_connected_nc_f32( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f32( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f32( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + +enum xnn_status xnn_create_fully_connected_nc_f32_qc4w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t kernel_zero_point, + const float* kernel_scale, + const uint8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f32_qc4w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f32_qc4w( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + +enum xnn_status xnn_create_fully_connected_nc_f32_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel_scale, + const int8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_f32_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_f32_qc8w( + xnn_operator_t fully_connected_op, + const float* input, + float* output); + 
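// --- Editorial sketch (not part of the diffed header) -------------------
// The fully-connected operators above follow XNNPACK's common three-step
// lifecycle: create once with the weights, reshape whenever the batch size
// changes, then setup to bind input/output pointers before running. A
// minimal sketch for the f32 variant; the zero-filled weights, the
// [output_channels, input_channels] row-major kernel layout, and the use of
// a NULL threadpool (single-threaded) are assumptions made for illustration.
#include <math.h>
#include <stddef.h>
#include <xnnpack.h>

static enum xnn_status fully_connected_example(void) {
  enum xnn_status st = xnn_initialize(/*allocator=*/NULL);
  if (st != xnn_status_success) return st;

  enum { BATCH = 2, IN_CH = 4, OUT_CH = 3 };
  static const float kernel[OUT_CH * IN_CH] = {0};  // assumed [out, in] layout
  static const float bias[OUT_CH] = {0};
  static const float input[BATCH * IN_CH] = {0};
  static float output[BATCH * OUT_CH];

  // Create: packs the weights into the operator object.
  xnn_operator_t fc_op = NULL;
  st = xnn_create_fully_connected_nc_f32(
      IN_CH, OUT_CH, /*input_stride=*/IN_CH, /*output_stride=*/OUT_CH,
      kernel, bias, /*output_min=*/-INFINITY, /*output_max=*/INFINITY,
      /*flags=*/0, /*code_cache=*/NULL, /*weights_cache=*/NULL, &fc_op);
  if (st != xnn_status_success) return st;

  // Reshape: fixes the batch size. Setup: binds the data pointers.
  st = xnn_reshape_fully_connected_nc_f32(fc_op, BATCH, /*threadpool=*/NULL);
  if (st == xnn_status_success)
    st = xnn_setup_fully_connected_nc_f32(fc_op, input, output);
  if (st == xnn_status_success)
    st = xnn_run_operator(fc_op, /*threadpool=*/NULL);
  xnn_delete_operator(fc_op);
  return st;
}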
+enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc4w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t kernel_zero_point, + const float* kernel_scale, + const void* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc4w( + xnn_operator_t fully_connected_op, + const int8_t* input, + void* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qc4w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qd8_f32_qc4w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t kernel_zero_point, + const float* kernel_scale, + const void* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc4w( + xnn_operator_t fully_connected_op, + const int8_t* input, + float* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc4w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel_scale, + const int8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qc8w( + xnn_operator_t fully_connected_op, + const int8_t* input, + void* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qd8_f32_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + const float* kernel_scale, + const int8_t* kernel, + const float* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qc8w( + xnn_operator_t fully_connected_op, + const int8_t* input, + float* output, + const struct xnn_dynamic_quantization_params* quantization_params); + +enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_fully_connected_nc_qs8( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + 
xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_qs8( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_qs8( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_fully_connected_nc_qs8_qc8w( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + const float* kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_qs8_qc8w( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_qs8_qc8w( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_fully_connected_nc_qu8( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + uint8_t input_zero_point, + float input_scale, + uint8_t kernel_zero_point, + float kernel_scale, + const uint8_t* kernel, + const int32_t* bias, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* fully_connected_op_out); + +enum xnn_status xnn_reshape_fully_connected_nc_qu8( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_fully_connected_nc_qu8( + xnn_operator_t fully_connected_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_global_average_pooling_ncw_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_ncw_f16( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_ncw_f16( + xnn_operator_t global_average_pooling_op, + const void* input, + void* output); + +enum xnn_status xnn_create_global_average_pooling_ncw_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_ncw_f32( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_ncw_f32( + xnn_operator_t global_average_pooling_op, + const float* input, + float* output); + +enum xnn_status xnn_create_global_average_pooling_nwc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_f16( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_f16( + xnn_operator_t global_average_pooling_op, + void* workspace, + const void* input, + void* output); + +enum 
xnn_status xnn_create_global_average_pooling_nwc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_f32( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_f32( + xnn_operator_t global_average_pooling_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_global_average_pooling_nwc_qs8( + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_qs8( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_qs8( + xnn_operator_t global_average_pooling_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_global_average_pooling_nwc_qu8( + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* global_average_pooling_op_out); + +enum xnn_status xnn_reshape_global_average_pooling_nwc_qu8( + xnn_operator_t global_average_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_average_pooling_nwc_qu8( + xnn_operator_t global_average_pooling_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_global_sum_pooling_nwc_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_sum_pooling_op_out); + +enum xnn_status xnn_reshape_global_sum_pooling_nwc_f16( + xnn_operator_t global_sum_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_sum_pooling_nwc_f16( + xnn_operator_t global_sum_pooling_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_global_sum_pooling_nwc_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* global_sum_pooling_op_out); + +enum xnn_status xnn_reshape_global_sum_pooling_nwc_f32( + xnn_operator_t global_sum_pooling_op, + size_t batch_size, + size_t width, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_global_sum_pooling_nwc_f32( + xnn_operator_t global_sum_pooling_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_hardswish_nc_f16( + uint32_t flags, + xnn_operator_t* hardswish_op_out); + +enum xnn_status xnn_reshape_hardswish_nc_f16( + xnn_operator_t hardswish_op, + size_t batch_size, + size_t channels, + size_t input_stride, + 
size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_hardswish_nc_f16( + xnn_operator_t hardswish_op, + const void* input, + void* output); + +enum xnn_status xnn_create_hardswish_nc_f32( + uint32_t flags, + xnn_operator_t* hardswish_op_out); + +enum xnn_status xnn_reshape_hardswish_nc_f32( + xnn_operator_t hardswish_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_hardswish_nc_f32( + xnn_operator_t hardswish_op, + const float* input, + float* output); + +enum xnn_status xnn_run_hardswish_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_leaky_relu_nc_f16( + float negative_slope, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_f16( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_f16( + xnn_operator_t leaky_relu_op, + const void* input, + void* output); + +enum xnn_status xnn_create_leaky_relu_nc_f32( + float negative_slope, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_f32( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_f32( + xnn_operator_t leaky_relu_op, + const float* input, + float* output); + +enum xnn_status xnn_run_leaky_relu_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + float negative_slope, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_leaky_relu_nc_qs8( + float negative_slope, + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_qs8( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_qs8( + xnn_operator_t leaky_relu_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_leaky_relu_nc_qu8( + float negative_slope, + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint32_t flags, + xnn_operator_t* leaky_relu_op_out); + +enum xnn_status xnn_reshape_leaky_relu_nc_qu8( + xnn_operator_t leaky_relu_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_leaky_relu_nc_qu8( + xnn_operator_t leaky_relu_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_f16( + xnn_operator_t 
max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_f16( + xnn_operator_t max_pooling_op, + const void* input, + void* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_f32( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_f32( + xnn_operator_t max_pooling_op, + const float* input, + float* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_s8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_s8( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_s8( + xnn_operator_t max_pooling_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_max_pooling2d_nhwc_u8( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + uint32_t stride_height, + uint32_t stride_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* max_pooling_op_out); + +enum xnn_status xnn_reshape_max_pooling2d_nhwc_u8( + xnn_operator_t max_pooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_max_pooling2d_nhwc_u8( + xnn_operator_t max_pooling_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_maximum_nd_f16( + uint32_t flags, + xnn_operator_t* maximum_op_out); + +enum xnn_status xnn_reshape_maximum_nd_f16( + xnn_operator_t maximum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_maximum_nd_f16( + xnn_operator_t maximum_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_maximum_nd_f32( + uint32_t flags, + xnn_operator_t* 
maximum_op_out); + +enum xnn_status xnn_reshape_maximum_nd_f32( + xnn_operator_t maximum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_maximum_nd_f32( + xnn_operator_t maximum_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_maximum_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_mean_nd_f16( + uint32_t flags, + xnn_operator_t* mean_op_out); + +enum xnn_status xnn_reshape_mean_nd_f16( + xnn_operator_t mean_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_mean_nd_f16( + xnn_operator_t mean_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_mean_nd_f32( + uint32_t flags, + xnn_operator_t* mean_op_out); + +enum xnn_status xnn_reshape_mean_nd_f32( + xnn_operator_t mean_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_mean_nd_f32( + xnn_operator_t mean_op, + void* workspace, + const float* input, + float* output); + +enum xnn_status xnn_create_minimum_nd_f16( + uint32_t flags, + xnn_operator_t* minimum_op_out); + +enum xnn_status xnn_reshape_minimum_nd_f16( + xnn_operator_t minimum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_minimum_nd_f16( + xnn_operator_t minimum_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_minimum_nd_f32( + uint32_t flags, + xnn_operator_t* minimum_op_out); + +enum xnn_status xnn_reshape_minimum_nd_f32( + xnn_operator_t minimum_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_minimum_nd_f32( + xnn_operator_t minimum_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_minimum_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_multiply_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_f16( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_f16( + xnn_operator_t multiply_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_multiply_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_f32( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* 
input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_f32( + xnn_operator_t multiply_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_multiply_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_multiply_nd_qs8( + int8_t input1_zero_point, + float input1_scale, + int8_t input2_zero_point, + float input2_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_qs8( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_qs8( + xnn_operator_t multiply_op, + const int8_t* input1, + const int8_t* input2, + int8_t* output); + +enum xnn_status xnn_run_multiply_nd_qs8( + size_t num_input1_dims, + const size_t* input1_shape, + int8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + int8_t input2_zero_point, + float input2_scale, + const int8_t* input1, + const int8_t* input2, + int8_t* output, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_multiply_nd_qu8( + uint8_t input1_zero_point, + float input1_scale, + uint8_t input2_zero_point, + float input2_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* multiply_op_out); + +enum xnn_status xnn_reshape_multiply_nd_qu8( + xnn_operator_t multiply_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_multiply_nd_qu8( + xnn_operator_t multiply_op, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output); + +enum xnn_status xnn_run_multiply_nd_qu8( + size_t num_input1_dims, + const size_t* input1_shape, + uint8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + uint8_t input2_zero_point, + float input2_scale, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_negate_nc_f16( + uint32_t flags, + xnn_operator_t* negate_op_out); + +enum xnn_status xnn_reshape_negate_nc_f16( + xnn_operator_t negate_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_negate_nc_f16( + xnn_operator_t negate_op, + const void* input, + void* output); + +enum xnn_status xnn_create_negate_nc_f32( + uint32_t flags, + xnn_operator_t* negate_op_out); + +enum xnn_status xnn_reshape_negate_nc_f32( + xnn_operator_t negate_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_negate_nc_f32( + xnn_operator_t negate_op, + 
const float* input, + float* output); + +enum xnn_status xnn_run_negate_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_prelu_nc_f16( + size_t channels, + size_t input_stride, + size_t output_stride, + const void* negative_slope, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* prelu_op_out); + +enum xnn_status xnn_reshape_prelu_nc_f16( + xnn_operator_t prelu_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_prelu_nc_f16( + xnn_operator_t prelu_op, + const void* input, + void* output); + +enum xnn_status xnn_create_prelu_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + const float* negative_slope, + uint32_t flags, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* prelu_op_out); + +enum xnn_status xnn_reshape_prelu_nc_f32( + xnn_operator_t prelu_op, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_prelu_nc_f32( + xnn_operator_t prelu_op, + const float* input, + float* output); + +enum xnn_status xnn_create_resize_bilinear2d_nchw_f32( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nchw_f32( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32( + xnn_operator_t resize_op, + const float* input, + float* output); + +enum xnn_status xnn_create_resize_bilinear2d_nchw_f16( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nchw_f16( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nchw_f16( + xnn_operator_t resize_op, + const void* input, + void* output); + +enum xnn_status xnn_create_resize_bilinear2d_nhwc_f16( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_f16( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f16( + xnn_operator_t resize_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_f32( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32( + xnn_operator_t resize_op, + void* workspace, + const float* input, + float* output); + 
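// --- Editorial sketch (not part of the diffed header) -------------------
// Operators whose reshape function reports workspace_size/workspace_alignment
// (global pooling, bilinear resize, attention) expect the caller to allocate
// the scratch buffer and hand it to setup. A minimal sketch with the f32
// bilinear resize declared above; the aligned_alloc-based allocation and the
// concrete shapes are assumptions made for illustration.
#include <stdlib.h>
#include <xnnpack.h>

static enum xnn_status resize_example(const float* in, float* out) {
  xnn_operator_t op = NULL;
  enum xnn_status st = xnn_create_resize_bilinear2d_nhwc_f32(
      /*output_height=*/64, /*output_width=*/64, /*flags=*/0, &op);
  if (st != xnn_status_success) return st;

  size_t ws_size = 0, ws_align = 0;
  st = xnn_reshape_resize_bilinear2d_nhwc_f32(
      op, /*batch_size=*/1, /*input_height=*/32, /*input_width=*/32,
      /*channels=*/8, /*input_pixel_stride=*/8, /*output_pixel_stride=*/8,
      &ws_size, &ws_align, /*threadpool=*/NULL);
  if (st == xnn_status_success) {
    // C11 aligned_alloc needs the size rounded up to a multiple of the
    // (power-of-two) alignment; a zero-sized workspace stays NULL.
    void* ws = ws_size == 0 ? NULL
        : aligned_alloc(ws_align, (ws_size + ws_align - 1) & ~(ws_align - 1));
    st = xnn_setup_resize_bilinear2d_nhwc_f32(op, ws, in, out);
    if (st == xnn_status_success) st = xnn_run_operator(op, /*threadpool=*/NULL);
    free(ws);
  }
  xnn_delete_operator(op);
  return st;
}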
+enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_s8( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8( + xnn_operator_t resize_op, + void* workspace, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8( + size_t output_height, + size_t output_width, + uint32_t flags, + xnn_operator_t* resize_op_out); + +enum xnn_status xnn_reshape_resize_bilinear2d_nhwc_u8( + xnn_operator_t resize_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8( + xnn_operator_t resize_op, + void* workspace, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_rope_nthc_f16( + size_t max_tokens, + uint32_t flags, + xnn_operator_t* rope_op_out); + +enum xnn_status xnn_reshape_rope_nthc_f16( + xnn_operator_t rope_op, + size_t batch_size, + size_t tokens, + size_t heads, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_rope_nthc_f16( + xnn_operator_t rope_op, + const void* input, + const void* weights, + void* output); + +enum xnn_status xnn_create_rope_nthc_f32( + size_t max_tokens, + uint32_t flags, + xnn_operator_t* rope_op_out); + +enum xnn_status xnn_reshape_rope_nthc_f32( + xnn_operator_t rope_op, + size_t batch_size, + size_t tokens, + size_t heads, + size_t channels, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_rope_nthc_f32( + xnn_operator_t rope_op, + const float* input, + const float* weights, + float* output); + +// N: batch size +// H: number of heads +// T: tokens (sequence length) +// C: channels (head dimension) +enum xnn_status xnn_create_scaled_dot_product_attention_nhtc_f16( + enum xnn_attention_logits_cap_type cap_type, + const void* cap_params, + uint32_t flags, + xnn_operator_t* attention_op_out); + +enum xnn_status xnn_reshape_scaled_dot_product_attention_nhtc_f16( + xnn_operator_t attention_op, + size_t batch_size, + size_t query_heads, + // Number of tokens in query. + size_t query_tokens, + size_t key_value_heads, + // Number of tokens in key/value. For self-attention, this is the same as query_tokens. + size_t key_value_tokens, + size_t query_key_channels, + size_t value_channels, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); +
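// --- Editorial sketch (not part of the diffed header) -------------------
// The attention operators use the same workspace-driven lifecycle as the
// pooling and resize operators above; the reshape parameters fix the buffer
// shapes documented in the comments around each setup function. A minimal
// self-attention sketch against the f32 variant declared just below; the
// xnn_attention_logits_cap_type_none enum member and the zero-filled sample
// buffers are assumptions made for illustration.
#include <stdlib.h>
#include <xnnpack.h>

static enum xnn_status attention_example(pthreadpool_t threadpool) {
  enum { N = 1, H = 2, Tq = 8, Tkv = 8, Cqk = 16, Cv = 16 };
  static float query[N * H * Tq * Cqk];  // [batch, query_heads, query_tokens, query_key_channels]
  static float key[N * H * Tkv * Cqk];   // [batch, kv_heads, kv_tokens, query_key_channels]
  static float value[N * H * Tkv * Cv];  // [batch, kv_heads, kv_tokens, value_channels]
  static float scale[Cqk];               // [query_key_channels]
  static float mask[Tq * Tkv];           // [query_tokens, key_value_tokens]
  static float output[N * H * Tq * Cv];  // [batch, query_heads, query_tokens, value_channels]

  xnn_operator_t op = NULL;
  enum xnn_status st = xnn_create_scaled_dot_product_attention_nhtc_f32(
      xnn_attention_logits_cap_type_none, /*cap_params=*/NULL, /*flags=*/0, &op);
  if (st != xnn_status_success) return st;

  size_t ws_size = 0, ws_align = 0;
  st = xnn_reshape_scaled_dot_product_attention_nhtc_f32(
      op, N, /*query_heads=*/H, Tq, /*key_value_heads=*/H, Tkv,
      /*query_key_channels=*/Cqk, /*value_channels=*/Cv,
      &ws_size, &ws_align, threadpool);
  if (st == xnn_status_success) {
    // Same caller-allocated workspace pattern as in the resize sketch.
    void* ws = ws_size == 0 ? NULL
        : aligned_alloc(ws_align, (ws_size + ws_align - 1) & ~(ws_align - 1));
    st = xnn_setup_scaled_dot_product_attention_nhtc_f32(
        op, ws, query, key, value, scale, mask, output);
    if (st == xnn_status_success) st = xnn_run_operator(op, threadpool);
    free(ws);
  }
  xnn_delete_operator(op);
  return st;
}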
+// Query is of dimension [batch_size, query_heads, query_tokens, query_key_channels]. +// Key is of dimension [batch_size, key_value_heads, key_value_tokens, query_key_channels]. +// Value is of dimension [batch_size, key_value_heads, key_value_tokens, value_channels]. +// Scale is of dimension [query_key_channels]. +// Mask is of dimension [query_tokens, key_value_tokens]. +// Output is of dimension [batch_size, query_heads, query_tokens, value_channels]. +enum xnn_status xnn_setup_scaled_dot_product_attention_nhtc_f16( + xnn_operator_t attention_op, + void* workspace, + const void* query, + const void* key, + const void* value, + const void* scale, + const void* mask, + void* output); + +// N: batch size +// H: number of heads +// T: tokens (sequence length) +// C: channels (head dimension) +enum xnn_status xnn_create_scaled_dot_product_attention_nhtc_f32( + enum xnn_attention_logits_cap_type cap_type, + const void* cap_params, + uint32_t flags, + xnn_operator_t* attention_op_out); + +enum xnn_status xnn_reshape_scaled_dot_product_attention_nhtc_f32( + xnn_operator_t attention_op, + size_t batch_size, + size_t query_heads, + // Number of tokens in query. + size_t query_tokens, + size_t key_value_heads, + // Number of tokens in key/value. For self-attention, this is the same as query_tokens. + size_t key_value_tokens, + size_t query_key_channels, + size_t value_channels, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +// Query is of dimension [batch_size, query_heads, query_tokens, query_key_channels]. +// Key is of dimension [batch_size, key_value_heads, key_value_tokens, query_key_channels]. +// Value is of dimension [batch_size, key_value_heads, key_value_tokens, value_channels]. +// Scale is of dimension [query_key_channels]. +// Mask is of dimension [query_tokens, key_value_tokens]. +// Output is of dimension [batch_size, query_heads, query_tokens, value_channels]. +enum xnn_status xnn_setup_scaled_dot_product_attention_nhtc_f32( + xnn_operator_t attention_op, + void* workspace, + const float* query, + const float* key, + const float* value, + const float* scale, + const float* mask, + float* output); + +enum xnn_status xnn_create_sigmoid_nc_f16( + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_f16( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_f16( + xnn_operator_t sigmoid_op, + const void* input, + void* output); + +enum xnn_status xnn_create_sigmoid_nc_f32( + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_f32( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_f32( + xnn_operator_t sigmoid_op, + const float* input, + float* output); + +enum xnn_status xnn_run_sigmoid_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_sigmoid_nc_qs8( + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_qs8( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_qs8( + xnn_operator_t sigmoid_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_sigmoid_nc_qu8( + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* sigmoid_op_out); + +enum xnn_status xnn_reshape_sigmoid_nc_qu8( + xnn_operator_t sigmoid_op, + size_t batch_size, + size_t
channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sigmoid_nc_qu8( + xnn_operator_t sigmoid_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_slice_nd_x16( + uint32_t flags, + xnn_operator_t* slice_op_out); + +enum xnn_status xnn_reshape_slice_nd_x16( + xnn_operator_t slice_op, + size_t num_dims, + const size_t* input_shape, + const size_t* offsets, + const size_t* sizes, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_slice_nd_x16( + xnn_operator_t slice_op, + const void* input, + void* output); + +enum xnn_status xnn_create_slice_nd_x32( + uint32_t flags, + xnn_operator_t* slice_op_out); + +enum xnn_status xnn_reshape_slice_nd_x32( + xnn_operator_t slice_op, + size_t num_dims, + const size_t* input_shape, + const size_t* offsets, + const size_t* sizes, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_slice_nd_x32( + xnn_operator_t slice_op, + const void* input, + void* output); + +enum xnn_status xnn_run_slice_nd_x32( + size_t num_dims, + const size_t* input_shape, + const size_t* offsets, + const size_t* sizes, + const void* input, + void* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_softmax_nc_f16( + uint32_t flags, + xnn_operator_t* softmax_op_out); + +enum xnn_status xnn_reshape_softmax_nc_f16( + xnn_operator_t softmax_op, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_softmax_nc_f16( + xnn_operator_t softmax_op, + const void* input, + void* output); + +enum xnn_status xnn_create_softmax_nc_f32( + uint32_t flags, + xnn_operator_t* softmax_op_out); + +enum xnn_status xnn_reshape_softmax_nc_f32( + xnn_operator_t softmax_op, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_softmax_nc_f32( + xnn_operator_t softmax_op, + const float* input, + float* output); + +enum xnn_status xnn_create_softmax_nc_qu8( + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint32_t flags, + xnn_operator_t* softmax_op_out); + +enum xnn_status xnn_reshape_softmax_nc_qu8( + xnn_operator_t softmax_op, + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_softmax_nc_qu8( + xnn_operator_t softmax_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_space_to_depth_nhwc_x16( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* space_to_depth_op_out); + +enum xnn_status xnn_reshape_space_to_depth_nhwc_x16( + xnn_operator_t space_to_depth_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_space_to_depth_nhwc_x16( + xnn_operator_t space_to_depth_op, + const void* input, + void* output); + +enum xnn_status xnn_create_space_to_depth_nhwc_x32( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* space_to_depth_op_out); + +enum xnn_status xnn_reshape_space_to_depth_nhwc_x32( + xnn_operator_t space_to_depth_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_space_to_depth_nhwc_x32( + xnn_operator_t space_to_depth_op, + const void* input, + void* output); + +enum xnn_status xnn_create_square_nc_f16( + uint32_t flags, + xnn_operator_t* square_op_out); + +enum xnn_status xnn_reshape_square_nc_f16( + xnn_operator_t square_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_nc_f16( + xnn_operator_t square_op, + const void* input, + void* output); + +enum xnn_status xnn_create_square_nc_f32( + uint32_t flags, + xnn_operator_t* square_op_out); + +enum xnn_status xnn_reshape_square_nc_f32( + xnn_operator_t square_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_nc_f32( + xnn_operator_t square_op, + const float* input, + float* output); + +enum xnn_status xnn_run_square_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_square_root_nc_f16( + uint32_t flags, + xnn_operator_t* sqrt_op_out); + +enum xnn_status xnn_reshape_square_root_nc_f16( + xnn_operator_t sqrt_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_root_nc_f16( + xnn_operator_t sqrt_op, + const void* input, + void* output); + +enum xnn_status xnn_create_square_root_nc_f32( + uint32_t flags, + xnn_operator_t* sqrt_op_out); + +enum xnn_status xnn_reshape_square_root_nc_f32( + xnn_operator_t sqrt_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_square_root_nc_f32( + xnn_operator_t sqrt_op, + const float* input, + float* output); + +enum xnn_status xnn_run_square_root_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_reciprocal_square_root_nc_f32( + uint32_t flags, xnn_operator_t* sqrt_op_out); + +enum xnn_status xnn_reshape_reciprocal_square_root_nc_f32( + xnn_operator_t sqrt_op, size_t batch_size, size_t channels, + size_t input_stride, size_t output_stride, pthreadpool_t threadpool); + +enum xnn_status xnn_setup_reciprocal_square_root_nc_f32(xnn_operator_t sqrt_op, + const float* input, + float* output); + +enum xnn_status xnn_run_reciprocal_square_root_nc_f32( + size_t channels, size_t input_stride, size_t output_stride, + size_t batch_size, const float* input, float* output, uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_squared_difference_nd_f16( + uint32_t flags, + xnn_operator_t* squared_difference_op_out); + +enum xnn_status xnn_reshape_squared_difference_nd_f16( + xnn_operator_t squared_difference_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_squared_difference_nd_f16( + xnn_operator_t squared_difference_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_squared_difference_nd_f32( + uint32_t flags, + xnn_operator_t* squared_difference_op_out); + +enum xnn_status xnn_reshape_squared_difference_nd_f32( + xnn_operator_t squared_difference_op, + size_t 
num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_squared_difference_nd_f32( + xnn_operator_t squared_difference_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_squared_difference_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_subtract_nd_f16( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_f16( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_f16( + xnn_operator_t subtract_op, + const void* input1, + const void* input2, + void* output); + +enum xnn_status xnn_create_subtract_nd_f32( + float output_min, + float output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_f32( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_f32( + xnn_operator_t subtract_op, + const float* input1, + const float* input2, + float* output); + +enum xnn_status xnn_run_subtract_nd_f32( + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + const float* input1, + const float* input2, + float* output, + float output_min, + float output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_subtract_nd_qs8( + int8_t input1_zero_point, + float input1_scale, + int8_t input2_zero_point, + float input2_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_qs8( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_qs8( + xnn_operator_t subtract_op, + const int8_t* input1, + const int8_t* input2, + int8_t* output); + +enum xnn_status xnn_run_subtract_nd_qs8( + size_t num_input1_dims, + const size_t* input1_shape, + int8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + int8_t input2_zero_point, + float input2_scale, + const int8_t* input1, + const int8_t* input2, + int8_t* output, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_subtract_nd_qu8( + uint8_t input1_zero_point, + float input1_scale, + uint8_t input2_zero_point, + float input2_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* subtract_op_out); + +enum xnn_status xnn_reshape_subtract_nd_qu8( + xnn_operator_t subtract_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_subtract_nd_qu8( + 
xnn_operator_t subtract_op, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output); + +enum xnn_status xnn_run_subtract_nd_qu8( + size_t num_input1_dims, + const size_t* input1_shape, + uint8_t input1_zero_point, + float input1_scale, + size_t num_input2_dims, + const size_t* input2_shape, + uint8_t input2_zero_point, + float input2_scale, + const uint8_t* input1, + const uint8_t* input2, + uint8_t* output, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_tanh_nc_f16( + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_f16( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_f16( + xnn_operator_t tanh_op, + const void* input, + void* output); + +enum xnn_status xnn_create_tanh_nc_f32( + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_f32( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_f32( + xnn_operator_t tanh_op, + const float* input, + float* output); + +enum xnn_status xnn_run_tanh_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_tanh_nc_qs8( + int8_t input_zero_point, + float input_scale, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_qs8( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_qs8( + xnn_operator_t tanh_op, + const int8_t* input, + int8_t* output); + +enum xnn_status xnn_create_tanh_nc_qu8( + uint8_t input_zero_point, + float input_scale, + uint8_t output_zero_point, + float output_scale, + uint8_t output_min, + uint8_t output_max, + uint32_t flags, + xnn_operator_t* tanh_op_out); + +enum xnn_status xnn_reshape_tanh_nc_qu8( + xnn_operator_t tanh_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_tanh_nc_qu8( + xnn_operator_t tanh_op, + const uint8_t* input, + uint8_t* output); + +enum xnn_status xnn_create_transpose_nd_x8( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x8( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_transpose_nd_x8( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x8( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_transpose_nd_x16( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x16( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status 
xnn_setup_transpose_nd_x16( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x16( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_transpose_nd_x32( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x32( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_transpose_nd_x32( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x32( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_transpose_nd_x64( + uint32_t flags, + xnn_operator_t* transpose_op_out); + +enum xnn_status xnn_reshape_transpose_nd_x64( + xnn_operator_t transpose_op, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_transpose_nd_x64( + xnn_operator_t transpose_op, + const void* input, + void* output); + +enum xnn_status xnn_run_transpose_nd_x64( + const void* input, + void* output, + size_t num_dims, + const size_t* input_shape, + const size_t* output_perm, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_truncation_nc_f16( + uint32_t flags, + xnn_operator_t* truncation_op_out); + +enum xnn_status xnn_reshape_truncation_nc_f16( + xnn_operator_t truncation_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_truncation_nc_f16( + xnn_operator_t truncation_op, + const void* input, + void* output); + +enum xnn_status xnn_create_truncation_nc_f32( + uint32_t flags, + xnn_operator_t* truncation_op_out); + +enum xnn_status xnn_reshape_truncation_nc_f32( + xnn_operator_t truncation_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_truncation_nc_f32( + xnn_operator_t truncation_op, + const float* input, + float* output); + +enum xnn_status xnn_run_truncation_nc_f32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const float* input, + float* output, + uint32_t flags, + pthreadpool_t threadpool); + +enum xnn_status xnn_create_unpooling2d_nhwc_x32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t pooling_height, + uint32_t pooling_width, + size_t channels, + size_t input_pixel_stride, + size_t output_pixel_stride, + uint32_t flags, + xnn_operator_t* unpooling_op_out); + +enum xnn_status xnn_reshape_unpooling2d_nhwc_x32( + xnn_operator_t unpooling_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_unpooling2d_nhwc_x32( + xnn_operator_t unpooling_op, + const void* input, + const uint32_t* index, + void* output); + +enum xnn_status xnn_create_slice_nd_x8( + uint32_t flags, + xnn_operator_t* slice_op_out); + +enum xnn_status xnn_reshape_slice_nd_x8( + xnn_operator_t slice_op, + size_t num_dims, + const size_t* input_shape, + const 
size_t* offsets, + const size_t* sizes, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_slice_nd_x8( + xnn_operator_t slice_op, + const void* input, + void* output); + +enum xnn_status xnn_create_space_to_depth_nhwc_x8( + uint32_t block_size, + uint32_t flags, + xnn_operator_t* space_to_depth_op_out); + +enum xnn_status xnn_reshape_space_to_depth_nhwc_x8( + xnn_operator_t space_to_depth_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t input_channels, + size_t* output_height_out, + size_t* output_width_out, + size_t* output_channels_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_space_to_depth_nhwc_x8( + xnn_operator_t space_to_depth_op, + const void* input, + void* output); + +#ifdef __cplusplus +} // extern "C" +#endif
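Note on the declarations added in this hunk: every operator family here (square, square_root, reciprocal_square_root, squared_difference, subtract, tanh, transpose, truncation, unpooling, slice, space_to_depth) follows the same create -> reshape -> setup -> run lifecycle. The sketch below is illustrative only and not part of the diff; it walks an f32 square-root operator through that lifecycle using the xnn_create_square_root_nc_f32 / xnn_reshape_square_root_nc_f32 / xnn_setup_square_root_nc_f32 signatures declared above. It assumes the xnn_initialize, xnn_run_operator, xnn_delete_operator entry points and the xnn_status_success enum value from elsewhere in this header, and passes a NULL pthreadpool to run on the calling thread.

/* Minimal lifecycle sketch for the operators declared in this hunk.
 * Assumes xnn_initialize / xnn_run_operator / xnn_delete_operator from
 * elsewhere in xnnpack.h; a NULL threadpool runs single-threaded. */
#include <stddef.h>
#include <stdio.h>
#include <xnnpack.h>

int run_sqrt_example(void) {
  if (xnn_initialize(/*allocator=*/NULL) != xnn_status_success) return 1;

  enum { kBatchSize = 4, kChannels = 8 };
  float input[kBatchSize * kChannels];
  float output[kBatchSize * kChannels];
  for (size_t i = 0; i < kBatchSize * kChannels; i++) input[i] = (float) i;

  /* 1. Create the operator object. */
  xnn_operator_t sqrt_op = NULL;
  if (xnn_create_square_root_nc_f32(/*flags=*/0, &sqrt_op) != xnn_status_success)
    return 1;

  /* 2. Reshape: bind batch size, channels, and (contiguous) strides. */
  if (xnn_reshape_square_root_nc_f32(
          sqrt_op, kBatchSize, kChannels,
          /*input_stride=*/kChannels, /*output_stride=*/kChannels,
          /*threadpool=*/NULL) != xnn_status_success)
    return 1;

  /* 3. Setup: attach the input and output buffers. */
  if (xnn_setup_square_root_nc_f32(sqrt_op, input, output) != xnn_status_success)
    return 1;

  /* 4. Run the prepared operator. */
  if (xnn_run_operator(sqrt_op, /*threadpool=*/NULL) != xnn_status_success)
    return 1;

  printf("sqrt(%f) = %f\n", input[3], output[3]);
  xnn_delete_operator(sqrt_op);
  return 0;
}

For one-off invocations, the xnn_run_* declarations in the same hunk collapse the four steps into a single call with the same semantics, e.g. (same assumptions as above):

  /* One-shot equivalent using the xnn_run_* form declared above. */
  xnn_run_square_root_nc_f32(
      kChannels, /*input_stride=*/kChannels, /*output_stride=*/kChannels,
      kBatchSize, input, output, /*flags=*/0, /*threadpool=*/NULL);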