# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Unit tests for Warp scalar arithmetic builtins (unary/binary ops, special
functions, float->int conversions, interpolation and clamping), including
checks of their adjoint (backward) formulas via ``wp.Tape``."""

import unittest

import numpy as np

import warp as wp
from warp.tests.unittest_utils import *

wp.init()

np_signed_int_types = [
    np.int8,
    np.int16,
    np.int32,
    np.int64,
    np.byte,
]

np_unsigned_int_types = [
    np.uint8,
    np.uint16,
    np.uint32,
    np.uint64,
    np.ubyte,
]

np_int_types = np_signed_int_types + np_unsigned_int_types

np_float_types = [np.float16, np.float32, np.float64]

np_scalar_types = np_int_types + np_float_types


def randvals(rng, shape, dtype):
    """Return random test values of the given shape/dtype.

    Floats are standard-normal; integers are drawn from a small positive
    range (kept narrow for 8-bit types so products/sums in the kernels
    don't overflow).
    """
    if dtype in np_float_types:
        return rng.standard_normal(size=shape).astype(dtype)
    elif dtype in [np.int8, np.uint8, np.byte, np.ubyte]:
        return rng.integers(1, high=3, size=shape, dtype=dtype)
    return rng.integers(1, high=5, size=shape, dtype=dtype)


# cache of generated kernels, keyed by function name + dtype suffix, so each
# (kernel, dtype) combination is only built once across tests:
kernel_cache = {}


def getkernel(func, suffix=""):
    """Build (or fetch from cache) a wp.Kernel for ``func``."""
    key = func.__name__ + "_" + suffix
    if key not in kernel_cache:
        kernel_cache[key] = wp.Kernel(func=func, key=key)
    return kernel_cache[key]


def get_select_kernel(dtype):
    """Kernel copying ``input[index]`` into ``out[0]`` (1D select)."""

    def output_select_kernel_fn(
        input: wp.array(dtype=dtype),
        index: int,
        out: wp.array(dtype=dtype),
    ):
        out[0] = input[index]

    return getkernel(output_select_kernel_fn, suffix=dtype.__name__)


def get_select_kernel2(dtype):
    """Kernel copying ``input[index0, index1]`` into ``out[0]`` (2D select)."""

    def output_select_kernel2_fn(
        input: wp.array(dtype=dtype, ndim=2),
        index0: int,
        index1: int,
        out: wp.array(dtype=dtype),
    ):
        out[0] = input[index0, index1]

    return getkernel(output_select_kernel2_fn, suffix=dtype.__name__)


def test_arrays(test, device, dtype):
    """Round-trip a numpy array through a wp.array and compare."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 1.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    arr_np = randvals(rng, (10, 5), dtype)
    arr = wp.array(arr_np, dtype=wptype, requires_grad=True, device=device)

    assert_np_equal(arr.numpy(), arr_np, tol=tol)


def test_unary_ops(test, device, dtype, register_kernels=False):
    """Check +x, -x, sign, abs and step, plus their gradients for floats."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_unary(
        inputs: wp.array(dtype=wptype, ndim=2),
        outputs: wp.array(dtype=wptype, ndim=2),
    ):
        for i in range(10):
            i0 = inputs[0, i]
            i1 = inputs[1, i]
            i2 = inputs[2, i]
            i3 = inputs[3, i]
            i4 = inputs[4, i]

            # multiply outputs by 2 so we've got something to backpropagate:
            outputs[0, i] = wptype(2.0) * (+i0)
            outputs[1, i] = wptype(2.0) * (-i1)
            outputs[2, i] = wptype(2.0) * wp.sign(i2)
            outputs[3, i] = wptype(2.0) * wp.abs(i3)
            outputs[4, i] = wptype(2.0) * wp.step(i4)

    kernel = getkernel(check_unary, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel2(wptype)

    if register_kernels:
        return

    if dtype in np_float_types:
        inputs = wp.array(
            rng.standard_normal(size=(5, 10)).astype(dtype), dtype=wptype, requires_grad=True, device=device
        )
    else:
        inputs = wp.array(
            rng.integers(-2, high=3, size=(5, 10), dtype=dtype), dtype=wptype, requires_grad=True, device=device
        )
    outputs = wp.zeros_like(inputs)
    wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)

    assert_np_equal(outputs.numpy()[0], 2 * inputs.numpy()[0], tol=tol)
    assert_np_equal(outputs.numpy()[1], -2 * inputs.numpy()[1], tol=tol)

    # wp.sign maps 0 to +1, unlike np.sign which maps 0 to 0:
    expected = 2 * np.sign(inputs.numpy()[2])
    expected[expected == 0] = 2
    assert_np_equal(outputs.numpy()[2], expected, tol=tol)
    assert_np_equal(outputs.numpy()[3], 2 * np.abs(inputs.numpy()[3]), tol=tol)
    assert_np_equal(outputs.numpy()[4], 2 * (1 - np.heaviside(inputs.numpy()[4], 1)), tol=tol)

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(10):
            # grad of 2x:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 0, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected_grads = np.zeros_like(inputs.numpy())
            expected_grads[0, i] = 2
            assert_np_equal(tape.gradients[inputs].numpy(), expected_grads, tol=tol)
            tape.zero()

            # grad of -2x:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 1, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected_grads = np.zeros_like(inputs.numpy())
            expected_grads[1, i] = -2
            assert_np_equal(tape.gradients[inputs].numpy(), expected_grads, tol=tol)
            tape.zero()

            # grad of 2 * sign(x): piecewise constant, so zero everywhere
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 2, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected_grads = np.zeros_like(inputs.numpy())
            assert_np_equal(tape.gradients[inputs].numpy(), expected_grads, tol=tol)
            tape.zero()

            # grad of 2 * abs(x):
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 3, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected_grads = np.zeros_like(inputs.numpy())
            expected_grads[3, i] = 2 * np.sign(inputs.numpy()[3, i])
            assert_np_equal(tape.gradients[inputs].numpy(), expected_grads, tol=tol)
            tape.zero()

            # grad of 2 * step(x): piecewise constant, so zero everywhere
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 4, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected_grads = np.zeros_like(inputs.numpy())
            assert_np_equal(tape.gradients[inputs].numpy(), expected_grads, tol=tol)
            tape.zero()


def test_nonzero(test, device, dtype, register_kernels=False):
    """Check wp.nonzero and that its gradient is identically zero."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_nonzero(
        inputs: wp.array(dtype=wptype),
        outputs: wp.array(dtype=wptype),
    ):
        for i in range(10):
            i0 = inputs[i]
            outputs[i] = wp.nonzero(i0)

    kernel = getkernel(check_nonzero, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel(wptype)

    if register_kernels:
        return

    inputs = wp.array(rng.integers(-2, high=3, size=10).astype(dtype), dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(inputs)
    wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)

    assert_np_equal(outputs.numpy(), (inputs.numpy() != 0))

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(10):
            # grad should just be zero:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected_grads = np.zeros_like(inputs.numpy())
            assert_np_equal(tape.gradients[inputs].numpy(), expected_grads, tol=tol)
            tape.zero()


def test_binary_ops(test, device, dtype, register_kernels=False):
    """Check mul/div/add/sub/mod/min/max/floordiv and their gradients."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-2,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_binary_ops(
        in1: wp.array(dtype=wptype, ndim=2),
        in2: wp.array(dtype=wptype, ndim=2),
        outputs: wp.array(dtype=wptype, ndim=2),
    ):
        for i in range(10):
            i0 = in1[0, i]
            i1 = in1[1, i]
            i2 = in1[2, i]
            i3 = in1[3, i]
            i4 = in1[4, i]
            i5 = in1[5, i]
            i6 = in1[6, i]
            i7 = in1[7, i]

            j0 = in2[0, i]
            j1 = in2[1, i]
            j2 = in2[2, i]
            j3 = in2[3, i]
            j4 = in2[4, i]
            j5 = in2[5, i]
            j6 = in2[6, i]
            j7 = in2[7, i]

            outputs[0, i] = wptype(2) * wp.mul(i0, j0)
            outputs[1, i] = wptype(2) * wp.div(i1, j1)
            outputs[2, i] = wptype(2) * wp.add(i2, j2)
            outputs[3, i] = wptype(2) * wp.sub(i3, j3)
            outputs[4, i] = wptype(2) * wp.mod(i4, j4)
            outputs[5, i] = wptype(2) * wp.min(i5, j5)
            outputs[6, i] = wptype(2) * wp.max(i6, j6)
            outputs[7, i] = wptype(2) * wp.floordiv(i7, j7)

    kernel = getkernel(check_binary_ops, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel2(wptype)

    if register_kernels:
        return

    vals1 = randvals(rng, [8, 10], dtype)
    # BUGFIX: was `if dtype in [np_unsigned_int_types]`, i.e. a membership test
    # against a one-element list containing the type list itself — always
    # False, so unsigned types never took the subtraction-safe branch:
    if dtype in np_unsigned_int_types:
        # ensure in2 >= in1 so wp.sub can't underflow an unsigned type:
        vals2 = vals1 + randvals(rng, [8, 10], dtype)
    else:
        vals2 = np.abs(randvals(rng, [8, 10], dtype))

    in1 = wp.array(vals1, dtype=wptype, requires_grad=True, device=device)
    in2 = wp.array(vals2, dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(in1)

    wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)

    assert_np_equal(outputs.numpy()[0], 2 * in1.numpy()[0] * in2.numpy()[0], tol=tol)
    if dtype in np_float_types:
        assert_np_equal(outputs.numpy()[1], 2 * in1.numpy()[1] / (in2.numpy()[1]), tol=tol)
    else:
        assert_np_equal(outputs.numpy()[1], 2 * (in1.numpy()[1] // (in2.numpy()[1])), tol=tol)
    assert_np_equal(outputs.numpy()[2], 2 * (in1.numpy()[2] + (in2.numpy()[2])), tol=tol)
    assert_np_equal(outputs.numpy()[3], 2 * (in1.numpy()[3] - (in2.numpy()[3])), tol=tol)

    # ...so this is actually the desired behaviour right? Looks like wp.mod doesn't behave like
    # python's % operator or np.mod()...
    assert_np_equal(
        outputs.numpy()[4],
        2
        * (
            (in1.numpy()[4])
            - (in2.numpy()[4]) * np.sign(in1.numpy()[4]) * np.floor(np.abs(in1.numpy()[4]) / (in2.numpy()[4]))
        ),
        tol=tol,
    )

    assert_np_equal(outputs.numpy()[5], 2 * np.minimum(in1.numpy()[5], in2.numpy()[5]), tol=tol)
    assert_np_equal(outputs.numpy()[6], 2 * np.maximum(in1.numpy()[6], in2.numpy()[6]), tol=tol)
    assert_np_equal(outputs.numpy()[7], 2 * np.floor_divide(in1.numpy()[7], in2.numpy()[7]), tol=tol)

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(10):
            # multiplication:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 0, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[0, i] = 2.0 * in2.numpy()[0, i]
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[0, i] = 2.0 * in1.numpy()[0, i]
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # division:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 1, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[1, i] = 2.0 / (in2.numpy()[1, i])
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            # y = x1/x2
            # dy/dx2 = -x1/x2^2
            expected[1, i] = (-2.0) * (in1.numpy()[1, i] / (in2.numpy()[1, i] ** 2))
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # addition:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 2, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[2, i] = 2.0
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[2, i] = 2.0
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # subtraction:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 3, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[3, i] = 2.0
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[3, i] = -2.0
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # modulus. unless at discontinuities,
            # d/dx1( x1 % x2 ) == 1
            # d/dx2( x1 % x2 ) == 0
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 4, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[4, i] = 2.0
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[4, i] = 0.0
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # min
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 5, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[5, i] = 2.0 if (in1.numpy()[5, i] < in2.numpy()[5, i]) else 0.0
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[5, i] = 2.0 if (in2.numpy()[5, i] < in1.numpy()[5, i]) else 0.0
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # max
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 6, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[6, i] = 2.0 if (in1.numpy()[6, i] > in2.numpy()[6, i]) else 0.0
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[6, i] = 2.0 if (in2.numpy()[6, i] > in1.numpy()[6, i]) else 0.0
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # floor_divide. Returns integers so gradient is zero
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 7, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()


def test_special_funcs(test, device, dtype, register_kernels=False):
    """Check log/exp/trig/hyperbolic/sqrt/cbrt builtins and gradients."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 1.0e-2,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_special_funcs(
        inputs: wp.array(dtype=wptype, ndim=2),
        outputs: wp.array(dtype=wptype, ndim=2),
    ):
        # multiply outputs by 2 so we've got something to backpropagate:
        for i in range(10):
            outputs[0, i] = wptype(2) * wp.log(inputs[0, i])
            outputs[1, i] = wptype(2) * wp.log2(inputs[1, i])
            outputs[2, i] = wptype(2) * wp.log10(inputs[2, i])
            outputs[3, i] = wptype(2) * wp.exp(inputs[3, i])
            outputs[4, i] = wptype(2) * wp.atan(inputs[4, i])
            outputs[5, i] = wptype(2) * wp.sin(inputs[5, i])
            outputs[6, i] = wptype(2) * wp.cos(inputs[6, i])
            outputs[7, i] = wptype(2) * wp.sqrt(inputs[7, i])
            outputs[8, i] = wptype(2) * wp.tan(inputs[8, i])
            outputs[9, i] = wptype(2) * wp.sinh(inputs[9, i])
            outputs[10, i] = wptype(2) * wp.cosh(inputs[10, i])
            outputs[11, i] = wptype(2) * wp.tanh(inputs[11, i])
            outputs[12, i] = wptype(2) * wp.acos(inputs[12, i])
            outputs[13, i] = wptype(2) * wp.asin(inputs[13, i])
            outputs[14, i] = wptype(2) * wp.cbrt(inputs[14, i])

    kernel = getkernel(check_special_funcs, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel2(wptype)

    if register_kernels:
        return

    invals = rng.normal(size=(15, 10)).astype(dtype)
    # keep log/sqrt/cbrt rows strictly positive, and acos/asin rows in (-1, 1):
    invals[[0, 1, 2, 7, 14]] = 0.1 + np.abs(invals[[0, 1, 2, 7, 14]])
    invals[12] = np.clip(invals[12], -0.9, 0.9)
    invals[13] = np.clip(invals[13], -0.9, 0.9)
    inputs = wp.array(invals, dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(inputs)

    wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)

    assert_np_equal(outputs.numpy()[0], 2 * np.log(inputs.numpy()[0]), tol=tol)
    assert_np_equal(outputs.numpy()[1], 2 * np.log2(inputs.numpy()[1]), tol=tol)
    assert_np_equal(outputs.numpy()[2], 2 * np.log10(inputs.numpy()[2]), tol=tol)
    assert_np_equal(outputs.numpy()[3], 2 * np.exp(inputs.numpy()[3]), tol=tol)
    assert_np_equal(outputs.numpy()[4], 2 * np.arctan(inputs.numpy()[4]), tol=tol)
    assert_np_equal(outputs.numpy()[5], 2 * np.sin(inputs.numpy()[5]), tol=tol)
    assert_np_equal(outputs.numpy()[6], 2 * np.cos(inputs.numpy()[6]), tol=tol)
    assert_np_equal(outputs.numpy()[7], 2 * np.sqrt(inputs.numpy()[7]), tol=tol)
    assert_np_equal(outputs.numpy()[8], 2 * np.tan(inputs.numpy()[8]), tol=tol)
    assert_np_equal(outputs.numpy()[9], 2 * np.sinh(inputs.numpy()[9]), tol=tol)
    assert_np_equal(outputs.numpy()[10], 2 * np.cosh(inputs.numpy()[10]), tol=tol)
    assert_np_equal(outputs.numpy()[11], 2 * np.tanh(inputs.numpy()[11]), tol=tol)
    assert_np_equal(outputs.numpy()[12], 2 * np.arccos(inputs.numpy()[12]), tol=tol)
    assert_np_equal(outputs.numpy()[13], 2 * np.arcsin(inputs.numpy()[13]), tol=tol)
    assert_np_equal(outputs.numpy()[14], 2 * np.cbrt(inputs.numpy()[14]), tol=tol)

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(10):
            # log:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 0, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[0, i] = 2.0 / inputs.numpy()[0, i]
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # log2:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 1, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[1, i] = 2.0 / (inputs.numpy()[1, i] * np.log(2.0))
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # log10:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 2, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[2, i] = 2.0 / (inputs.numpy()[2, i] * np.log(10.0))
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # exp:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 3, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            # d/dx( 2 e^x ) = 2 e^x, i.e. the output itself:
            expected[3, i] = outputs.numpy()[3, i]
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # arctan:
            # looks like the autodiff formula in warp was wrong? Was (1 + x^2) rather than
            # 1/(1 + x^2)
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 4, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[4, i] = 2.0 / (inputs.numpy()[4, i] ** 2 + 1)
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # sin:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 5, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[5, i] = np.cos(inputs.numpy()[5, i]) * 2
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # cos:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 6, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[6, i] = -np.sin(inputs.numpy()[6, i]) * 2.0
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # sqrt:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 7, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[7, i] = 1.0 / (np.sqrt(inputs.numpy()[7, i]))
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # tan:
            # looks like there was a bug in autodiff formula here too - gradient was zero if cos(x) > 0
            # (should have been "if(cosx != 0)")
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 8, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[8, i] = 2.0 / (np.cos(inputs.numpy()[8, i]) ** 2)
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=200 * tol)
            tape.zero()

            # sinh:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 9, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[9, i] = 2.0 * np.cosh(inputs.numpy()[9, i])
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # cosh:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 10, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[10, i] = 2.0 * np.sinh(inputs.numpy()[10, i])
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # tanh:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 11, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[11, i] = 2.0 / (np.cosh(inputs.numpy()[11, i]) ** 2)
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # arccos:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 12, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[12, i] = -2.0 / np.sqrt(1 - inputs.numpy()[12, i] ** 2)
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()

            # arcsin:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 13, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            expected[13, i] = 2.0 / np.sqrt(1 - inputs.numpy()[13, i] ** 2)
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=6 * tol)
            tape.zero()

            # cbrt:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 14, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(inputs.numpy())
            # d/dx( 2 x^(1/3) ) = (2/3) x^(-2/3):
            cbrt = np.cbrt(inputs.numpy()[14, i], dtype=np.dtype(dtype))
            expected[14, i] = (2.0 / 3.0) * (1.0 / (cbrt * cbrt))
            assert_np_equal(tape.gradients[inputs].numpy(), expected, tol=tol)
            tape.zero()


def test_special_funcs_2arg(test, device, dtype, register_kernels=False):
    """Check two-argument builtins (pow, atan2) and their gradients."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 1.0e-2,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_special_funcs_2arg(
        in1: wp.array(dtype=wptype, ndim=2),
        in2: wp.array(dtype=wptype, ndim=2),
        outputs: wp.array(dtype=wptype, ndim=2),
    ):
        # multiply outputs by 2 so we've got something to backpropagate:
        for i in range(10):
            outputs[0, i] = wptype(2) * wp.pow(in1[0, i], in2[0, i])
            outputs[1, i] = wptype(2) * wp.atan2(in1[1, i], in2[1, i])

    kernel = getkernel(check_special_funcs_2arg, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel2(wptype)

    if register_kernels:
        return

    # pow base must be positive for a real-valued result and gradient:
    in1 = wp.array(np.abs(randvals(rng, [2, 10], dtype)), dtype=wptype, requires_grad=True, device=device)
    in2 = wp.array(randvals(rng, [2, 10], dtype), dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(in1)

    wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)

    assert_np_equal(outputs.numpy()[0], 2.0 * np.power(in1.numpy()[0], in2.numpy()[0]), tol=tol)
    assert_np_equal(outputs.numpy()[1], 2.0 * np.arctan2(in1.numpy()[1], in2.numpy()[1]), tol=tol)

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(10):
            # pow:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 0, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[0, i] = 2.0 * in2.numpy()[0, i] * np.power(in1.numpy()[0, i], in2.numpy()[0, i] - 1)
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=5 * tol)
            expected[0, i] = 2.0 * np.power(in1.numpy()[0, i], in2.numpy()[0, i]) * np.log(in1.numpy()[0, i])
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()

            # atan2:
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 1, i], outputs=[out], device=device)

            tape.backward(loss=out)
            expected = np.zeros_like(in1.numpy())
            expected[1, i] = 2.0 * in2.numpy()[1, i] / (in1.numpy()[1, i] ** 2 + in2.numpy()[1, i] ** 2)
            assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
            expected[1, i] = -2.0 * in1.numpy()[1, i] / (in1.numpy()[1, i] ** 2 + in2.numpy()[1, i] ** 2)
            assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            tape.zero()


def test_float_to_int(test, device, dtype, register_kernels=False):
    """Check round/rint/trunc/floor/ceil/frac against numpy equivalents."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_float_to_int(
        inputs: wp.array(dtype=wptype, ndim=2),
        outputs: wp.array(dtype=wptype, ndim=2),
    ):
        for i in range(10):
            outputs[0, i] = wp.round(inputs[0, i])
            outputs[1, i] = wp.rint(inputs[1, i])
            outputs[2, i] = wp.trunc(inputs[2, i])
            outputs[3, i] = wp.floor(inputs[3, i])
            outputs[4, i] = wp.ceil(inputs[4, i])
            outputs[5, i] = wp.frac(inputs[5, i])

    kernel = getkernel(check_float_to_int, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel2(wptype)

    if register_kernels:
        return

    inputs = wp.array(rng.standard_normal(size=(6, 10)).astype(dtype), dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(inputs)

    wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)

    assert_np_equal(outputs.numpy()[0], np.round(inputs.numpy()[0]))
    assert_np_equal(outputs.numpy()[1], np.rint(inputs.numpy()[1]))
    assert_np_equal(outputs.numpy()[2], np.trunc(inputs.numpy()[2]))
    assert_np_equal(outputs.numpy()[3], np.floor(inputs.numpy()[3]))
    assert_np_equal(outputs.numpy()[4], np.ceil(inputs.numpy()[4]))
    assert_np_equal(outputs.numpy()[5], np.modf(inputs.numpy()[5])[0])

    # all the gradients should be zero as these functions are piecewise constant:
    # (frac, row 5, is deliberately excluded from the j-loop — it is not
    # piecewise constant)
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    for i in range(10):
        for j in range(5):
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[inputs], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, j, i], outputs=[out], device=device)

            tape.backward(loss=out)
            assert_np_equal(tape.gradients[inputs].numpy(), np.zeros_like(inputs.numpy()), tol=tol)
            tape.zero()


def test_interp(test, device, dtype, register_kernels=False):
    """Check wp.smoothstep and wp.lerp, with analytic gradient comparisons."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 1.0e-2,
        np.float32: 5.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_interp(
        in1: wp.array(dtype=wptype, ndim=2),
        in2: wp.array(dtype=wptype, ndim=2),
        in3: wp.array(dtype=wptype, ndim=2),
        outputs: wp.array(dtype=wptype, ndim=2),
    ):
        # multiply outputs by 2 so we've got something to backpropagate:
        for i in range(10):
            outputs[0, i] = wptype(2) * wp.smoothstep(in1[0, i], in2[0, i], in3[0, i])
            outputs[1, i] = wptype(2) * wp.lerp(in1[1, i], in2[1, i], in3[1, i])

    kernel = getkernel(check_interp, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel2(wptype)

    if register_kernels:
        return

    # ensure edge1 > edge0 so smoothstep's interval is well-defined:
    e0 = randvals(rng, [2, 10], dtype)
    e1 = e0 + randvals(rng, [2, 10], dtype) + 0.1
    in1 = wp.array(e0, dtype=wptype, requires_grad=True, device=device)
    in2 = wp.array(e1, dtype=wptype, requires_grad=True, device=device)
    in3 = wp.array(randvals(rng, [2, 10], dtype), dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(in1)

    wp.launch(kernel, dim=1, inputs=[in1, in2, in3], outputs=[outputs], device=device)

    edge0 = in1.numpy()[0]
    edge1 = in2.numpy()[0]
    t_smoothstep = in3.numpy()[0]
    x = np.clip((t_smoothstep - edge0) / (edge1 - edge0), 0, 1)
    smoothstep_expected = 2.0 * x * x * (3 - 2 * x)

    assert_np_equal(outputs.numpy()[0], smoothstep_expected, tol=tol)

    a = in1.numpy()[1]
    b = in2.numpy()[1]
    t = in3.numpy()[1]
    assert_np_equal(outputs.numpy()[1], 2.0 * (a * (1 - t) + b * t), tol=tol)

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(10):
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2, in3], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 0, i], outputs=[out], device=device)

            tape.backward(loss=out)

            # e0 = in1
            # e1 = in2
            # t = in3
            # x = clamp((t - e0) / (e1 - e0), 0,1)
            # dx/dt = 1 / (e1 - e0) if e0 < t < e1 else 0
            # y = x * x * (3 - 2 * x)
            # y = 3 * x * x - 2 * x * x * x
            # dy/dx = 6 * ( x - x^2 )
            dydx = 6 * x * (1 - x)

            # dy/in1 = dy/dx dx/de0 de0/din1
            dxde0 = (t_smoothstep - edge1) / ((edge1 - edge0) ** 2)
            dxde0[x == 0] = 0
            dxde0[x == 1] = 0

            expected_grads = np.zeros_like(in1.numpy())
            expected_grads[0, i] = 2.0 * dydx[i] * dxde0[i]
            assert_np_equal(tape.gradients[in1].numpy(), expected_grads, tol=tol)

            # dy/in2 = dy/dx dx/de1 de1/din2
            dxde1 = (edge0 - t_smoothstep) / ((edge1 - edge0) ** 2)
            dxde1[x == 0] = 0
            dxde1[x == 1] = 0

            expected_grads = np.zeros_like(in1.numpy())
            expected_grads[0, i] = 2.0 * dydx[i] * dxde1[i]
            assert_np_equal(tape.gradients[in2].numpy(), expected_grads, tol=tol)

            # dy/in3 = dy/dx dx/dt dt/din3
            dxdt = 1.0 / (edge1 - edge0)
            dxdt[x == 0] = 0
            dxdt[x == 1] = 0

            expected_grads = np.zeros_like(in1.numpy())
            expected_grads[0, i] = 2.0 * dydx[i] * dxdt[i]
            assert_np_equal(tape.gradients[in3].numpy(), expected_grads, tol=tol)
            tape.zero()

            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2, in3], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, 1, i], outputs=[out], device=device)

            tape.backward(loss=out)

            # y = a*(1-t) + b*t
            # a = in1
            # b = in2
            # t = in3
            # y = in1*( 1 - in3 ) + in2*in3

            # dy/din1 = (1-in3)
            expected_grads = np.zeros_like(in1.numpy())
            expected_grads[1, i] = 2.0 * (1 - in3.numpy()[1, i])
            assert_np_equal(tape.gradients[in1].numpy(), expected_grads, tol=tol)

            # dy/din2 = in3
            expected_grads = np.zeros_like(in1.numpy())
            expected_grads[1, i] = 2.0 * in3.numpy()[1, i]
            assert_np_equal(tape.gradients[in2].numpy(), expected_grads, tol=tol)

            # dy/din3 = in2 - in1
            expected_grads = np.zeros_like(in1.numpy())
            expected_grads[1, i] = 2.0 * (in2.numpy()[1, i] - in1.numpy()[1, i])
            assert_np_equal(tape.gradients[in3].numpy(), expected_grads, tol=tol)
            tape.zero()


def test_clamp(test, device, dtype, register_kernels=False):
    """Check wp.clamp against np.clip, and its subgradient per branch."""
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-6,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]

    def check_clamp(
        in1: wp.array(dtype=wptype),
        in2: wp.array(dtype=wptype),
        in3: wp.array(dtype=wptype),
        outputs: wp.array(dtype=wptype),
    ):
        for i in range(100):
            # multiply output by 2 so we've got something to backpropagate:
            outputs[i] = wptype(2) * wp.clamp(in1[i], in2[i], in3[i])

    kernel = getkernel(check_clamp, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel(wptype)

    if register_kernels:
        return

    in1 = wp.array(randvals(rng, [100], dtype), dtype=wptype, requires_grad=True, device=device)
    # ensure upper bound >= lower bound:
    starts = randvals(rng, [100], dtype)
    diffs = np.abs(randvals(rng, [100], dtype))
    in2 = wp.array(starts, dtype=wptype, requires_grad=True, device=device)
    in3 = wp.array(starts + diffs, dtype=wptype, requires_grad=True, device=device)
    outputs = wp.zeros_like(in1)

    wp.launch(kernel, dim=1, inputs=[in1, in2, in3], outputs=[outputs], device=device)

    assert_np_equal(2 * np.clip(in1.numpy(), in2.numpy(), in3.numpy()), outputs.numpy(), tol=tol)

    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    if dtype in np_float_types:
        for i in range(100):
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[in1, in2, in3], outputs=[outputs], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outputs, i], outputs=[out], device=device)

            tape.backward(loss=out)
            t = in1.numpy()[i]
            lower = in2.numpy()[i]
            upper = in3.numpy()[i]
            expected = np.zeros_like(in1.numpy())
            if t < lower:
                # clamped to the lower bound: only in2 has nonzero grad
                expected[i] = 2.0
                assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
                expected[i] = 0.0
                assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
                assert_np_equal(tape.gradients[in3].numpy(), expected, tol=tol)
            elif t > upper:
                # clamped to the upper bound: only in3 has nonzero grad
                expected[i] = 2.0
                assert_np_equal(tape.gradients[in3].numpy(), expected, tol=tol)
                expected[i] = 0.0
                assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
                assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
            else:
                # inside the interval: only in1 has nonzero grad
                expected[i] = 2.0
                assert_np_equal(tape.gradients[in1].numpy(), expected, tol=tol)
                expected[i] = 0.0
                assert_np_equal(tape.gradients[in2].numpy(), expected, tol=tol)
                assert_np_equal(tape.gradients[in3].numpy(), expected, tol=tol)
            tape.zero()


devices = get_test_devices()


class TestArithmetic(unittest.TestCase):
    pass


# these unary ops only make sense for signed values:
for dtype in np_signed_int_types + np_float_types:
    add_function_test_register_kernel(
        TestArithmetic, f"test_unary_ops_{dtype.__name__}", test_unary_ops, devices=devices, dtype=dtype
    )

for dtype in np_float_types:
    add_function_test_register_kernel(
        TestArithmetic, f"test_special_funcs_{dtype.__name__}", test_special_funcs, devices=devices, dtype=dtype
    )
    add_function_test_register_kernel(
        TestArithmetic,
        f"test_special_funcs_2arg_{dtype.__name__}",
        test_special_funcs_2arg,
        devices=devices,
        dtype=dtype,
    )
    add_function_test_register_kernel(
        TestArithmetic, f"test_interp_{dtype.__name__}", test_interp, devices=devices, dtype=dtype
    )
    add_function_test_register_kernel(
        TestArithmetic, f"test_float_to_int_{dtype.__name__}", test_float_to_int, devices=devices, dtype=dtype
    )

for dtype in np_scalar_types:
    add_function_test_register_kernel(
        TestArithmetic, f"test_clamp_{dtype.__name__}", test_clamp, devices=devices, dtype=dtype
    )
    add_function_test_register_kernel(
        TestArithmetic, f"test_nonzero_{dtype.__name__}", test_nonzero, devices=devices, dtype=dtype
    )
    add_function_test(TestArithmetic, f"test_arrays_{dtype.__name__}", test_arrays, devices=devices, dtype=dtype)
    add_function_test_register_kernel(
        TestArithmetic, f"test_binary_ops_{dtype.__name__}", test_binary_ops, devices=devices, dtype=dtype
    )


if __name__ == "__main__":
    wp.build.clear_kernel_cache()
    unittest.main(verbosity=2, failfast=False)