# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import unittest

import numpy as np

import warp as wp
from warp.tests.unittest_utils import *

# Initialize Warp once at import time, ahead of every kernel/array created by
# the tests defined below in this module.
wp.init()

# Signed integer NumPy scalar types.
# NOTE(review): np.byte is normally the same type object as np.int8, so the
# final entry looks like a duplicate -- confirm whether that is intentional.
np_signed_int_types = [np.int8, np.int16, np.int32, np.int64, np.byte]

# Floating-point NumPy scalar types. The tests in this file use membership in
# this list to decide whether gradient checks are run for a given dtype.
np_float_types = [
    np.float16,
    np.float32,
    np.float64,
]


def randvals(rng, shape, dtype):
    """Draw a random array of the given ``shape`` and ``dtype`` from ``rng``.

    Float dtypes (those listed in ``np_float_types``) get standard-normal
    samples cast to ``dtype``. Every other dtype gets integers drawn with
    ``rng.integers`` starting at 1, with ``high=3`` for the 8-bit types and
    ``high=5`` otherwise.
    """
    if dtype in np_float_types:
        samples = rng.standard_normal(size=shape)
        return samples.astype(dtype)

    # The 8-bit types are given a smaller upper bound than the wider integers.
    is_8bit = dtype in [np.int8, np.uint8, np.byte, np.ubyte]
    upper = 3 if is_8bit else 5
    return rng.integers(1, high=upper, size=shape, dtype=dtype)


# Maps "<function name>_<suffix>" keys to the wp.Kernel objects built by getkernel().
kernel_cache = {}


def getkernel(func, suffix=""):
    """Return the ``wp.Kernel`` for ``func``, creating and caching it on first use.

    The cache key (also passed to ``wp.Kernel`` as ``key``) is
    ``func.__name__`` and ``suffix`` joined by an underscore, so the same
    Python function can be registered once per suffix.
    """
    cache_key = "_".join((func.__name__, suffix))
    if cache_key in kernel_cache:
        return kernel_cache[cache_key]

    new_kernel = wp.Kernel(func=func, key=cache_key)
    kernel_cache[cache_key] = new_kernel
    return new_kernel


def get_select_kernel(dtype):
    """Return a cached kernel that copies one element of an array into ``out[0]``.

    The kernel takes an input array of ``dtype``, an integer ``index`` and an
    output array of ``dtype``, and writes ``input[index]`` to ``out[0]``. The
    tests use it to pick a single scalar out of a result array so it can be
    passed as the loss to ``tape.backward``.

    The kernel is obtained through ``getkernel`` with ``dtype.__name__`` as the
    suffix, so one kernel is cached per element type.
    """

    # No docstring or extra statements inside the kernel function: its body is
    # what gets handed to wp.Kernel.
    def output_select_kernel_fn(
        input: wp.array(dtype=dtype),
        index: int,
        out: wp.array(dtype=dtype),
    ):
        out[0] = input[index]

    # The original had an unreachable `wp.launch(kernel, ...)` after this
    # return, referencing an undefined name; it has been removed.
    return getkernel(output_select_kernel_fn, suffix=dtype.__name__)


def test_anon_constructor_error_shape_keyword_missing(test, device):
    """Expect a RuntimeError when ``wp.matrix()`` gets values but no ``shape``."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.matrix(1.0, 2.0, 3.0)

    expected_msg = r"shape keyword must be specified when calling matrix\(\) function$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_anon_constructor_error_dtype_keyword_missing(test, device):
    """Expect a RuntimeError when ``wp.matrix()`` has a shape but no dtype or values."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.matrix(shape=(3, 3))

    expected_msg = r"matrix\(\) must have dtype as a keyword argument if it has no positional arguments$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_anon_constructor_error_shape_mismatch(test, device):
    """Expect a RuntimeError when a (1, 2) matrix is copy-cast to shape (3, 4)."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.matrix(wp.matrix(shape=(1, 2), dtype=float), shape=(3, 4), dtype=float)

    expected_msg = r"Incompatible matrix sizes for casting copy constructor, \(3, 4\) vs \(1, 2\)$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_anon_constructor_error_invalid_arg_count(test, device):
    """Expect a RuntimeError when a 2x2 ``wp.matrix()`` is given three values."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.matrix(1.0, 2.0, 3.0, shape=(2, 2), dtype=float)

    expected_msg = (
        r"Wrong number of arguments for matrix\(\) function, "
        r"must initialize with either a scalar value, or m\*n values$"
    )
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_tpl_constructor_error_incompatible_sizes(test, device):
    """Expect a RuntimeError when ``wp.mat33`` is constructed from a ``wp.mat22``."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.mat33(wp.mat22(1.0, 2.0, 3.0, 4.0))

    expected_msg = r"Incompatible matrix sizes for casting copy constructor, \(3, 3\) vs \(2, 2\)$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_tpl_constructor_error_invalid_scalar_type(test, device):
    """Expect a RuntimeError when ``wp.mat22`` is given integer literals."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.mat22(1, 2, 3, 4)

    expected_msg = r"Wrong scalar type for mat 2,2,<class 'warp.types.float32'> constructor$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_tpl_constructor_error_invalid_vector_count(test, device):
    """Expect a RuntimeError when ``wp.mat22`` is given a single ``wp.vec3``."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.mat22(wp.vec3(1.0, 2.0, 3.0))

    expected_msg = r"Wrong number of vectors when attempting to construct a matrix with column vectors$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_tpl_constructor_error_invalid_vector_shape(test, device):
    """Expect a RuntimeError when ``wp.mat22`` is given two ``wp.vec3`` vectors."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.mat22(wp.vec3(1.0, 2.0, 3.0), wp.vec3(4.0, 5.0, 6.0))

    expected_msg = r"Wrong vector row count when attempting to construct a matrix with column vectors$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_tpl_constructor_error_invalid_arg_count(test, device):
    """Expect a RuntimeError when ``wp.mat22`` is given three scalars."""

    # The kernel keeps the name `kernel`; it is what Warp registers it under.
    @wp.kernel
    def kernel():
        wp.mat22(1.0, 2.0, 3.0)

    expected_msg = r"Wrong number of scalars when attempting to construct a matrix from a list of components$"
    with test.assertRaisesRegex(RuntimeError, expected_msg):
        wp.launch(kernel, dim=1, inputs=[], device=device)


def test_tpl_ops_with_anon(test, device):
    """Check ``+=`` / ``-=`` between ``wp.mat22f`` and a ``wp.mat((2, 2), float)`` type.

    Both operand orders are exercised: the predefined type on the left with
    the ``wp.mat``-created type on the right, and the reverse.
    """
    mat22f = wp.mat((2, 2), dtype=float)
    expected = ((0.0, 1.0), (2.0, 3.0))

    for lhs_cls, rhs_cls in ((wp.mat22f, mat22f), (mat22f, wp.mat22f)):
        m = lhs_cls(1.0, 2.0, 3.0, 4.0)
        m += rhs_cls(2.0, 3.0, 4.0, 5.0)
        m -= rhs_cls(3.0, 4.0, 5.0, 6.0)
        test.assertSequenceEqual(m, expected)


def test_py_arithmetic_ops(test, device, dtype):
    """Check Python-side arithmetic operators on 3x3 matrices of ``dtype``.

    Covers unary ``+``/``-``, matrix ``+``/``-``, matrix-vector and
    vector-matrix products via both ``*`` and ``@``, and scalar ``*`` / ``/``
    in both operand orders. For integer Warp types the expected values are
    passed through the scalar type first to simulate wrapping.
    """
    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    needs_wrapping = wptype in wp.types.int_types

    def make_vec(*components):
        if not needs_wrapping:
            return components
        # Cast to the correct integer type to simulate wrapping.
        return tuple(wptype._type_(c).value for c in components)

    def make_mat(*rows):
        if not needs_wrapping:
            return rows
        # Cast to the correct integer type to simulate wrapping.
        return tuple(make_vec(*row) for row in rows)

    mat_cls = wp.mat((3, 3), wptype)
    vec_cls = wp.vec(3, wptype)

    def fives_vec():
        return vec_cls(5, 5, 5)

    def fives_mat():
        return mat_cls((5, 5, 5) * 3)

    check = test.assertSequenceEqual

    # Matrix with mixed signs: unary ops, matrix add/sub and vector products.
    m = mat_cls(((-1, 2, 3), (4, -5, 6), (7, 8, -9)))
    check(+m, make_mat((-1, 2, 3), (4, -5, 6), (7, 8, -9)))
    check(-m, make_mat((1, -2, -3), (-4, 5, -6), (-7, -8, 9)))
    check(m + fives_mat(), make_mat((4, 7, 8), (9, 0, 11), (12, 13, -4)))
    check(m - fives_mat(), make_mat((-6, -3, -2), (-1, -10, 1), (2, 3, -14)))
    check(m * fives_vec(), make_vec(20, 25, 30))
    check(m @ fives_vec(), make_vec(20, 25, 30))
    check(fives_vec() * m, make_vec(50, 25, 0))
    check(fives_vec() @ m, make_vec(50, 25, 0))

    # All-even matrix: scalar multiply/divide plus the vector products again.
    m = mat_cls(((2, 4, 6), (8, 10, 12), (14, 16, 18)))
    doubled = make_mat((4, 8, 12), (16, 20, 24), (28, 32, 36))
    check(m * wptype(2), doubled)
    check(wptype(2) * m, doubled)
    check(m / wptype(2), make_mat((1, 2, 3), (4, 5, 6), (7, 8, 9)))
    check(wptype(5040) / m, make_mat((2520, 1260, 840), (630, 504, 420), (360, 315, 280)))
    check(m * fives_vec(), make_vec(60, 150, 240))
    check(m @ fives_vec(), make_vec(60, 150, 240))
    check(fives_vec() * m, make_vec(120, 150, 180))
    check(fives_vec() @ m, make_vec(120, 150, 180))


def test_quat_constructor(test, device, dtype, register_kernels=False):
    """Check the 4x4 matrix constructor taking (position, quaternion, scale).

    A kernel builds the matrix twice: once with ``mat44(p, r, s)`` and once by
    hand from ``wp.quat_to_matrix`` plus the scale and translation. The test
    compares all 16 components of the two results, and then compares the
    gradients of each component with respect to ``p``, ``r`` and ``s``.

    When ``register_kernels`` is true the function returns right after the
    kernel has been created through ``getkernel``, without launching anything.
    """
    rng = np.random.default_rng(123)

    # Per-dtype tolerance for the gradient comparison; 0 for dtypes not listed.
    tol = {
        np.float16: 1.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
    vec4 = wp.types.vector(length=4, dtype=wptype)
    vec3 = wp.types.vector(length=3, dtype=wptype)
    quat = wp.types.quaternion(dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    def check_mat_quat_constructor(
        p: wp.array(dtype=vec3),
        r: wp.array(dtype=quat),
        s: wp.array(dtype=vec3),
        outcomponents: wp.array(dtype=wptype),
        outcomponents_alt: wp.array(dtype=wptype),
    ):
        # matrix under test: built directly from position, rotation and scale
        m = mat44(p[0], r[0], s[0])

        # reference matrix: scale the rows of the transposed rotation matrix,
        # then pass them, together with the translation, as four vec4 arguments
        R = wp.transpose(wp.quat_to_matrix(r[0]))
        c0 = s[0][0] * R[0]
        c1 = s[0][1] * R[1]
        c2 = s[0][2] * R[2]
        m_alt = mat44(
            vec4(c0[0], c0[1], c0[2], wptype(0.0)),
            vec4(c1[0], c1[1], c1[2], wptype(0.0)),
            vec4(c2[0], c2[1], c2[2], wptype(0.0)),
            vec4(p[0][0], p[0][1], p[0][2], wptype(1.0)),
        )

        # flatten both matrices in row-major order into the output arrays
        idx = 0
        for i in range(4):
            for j in range(4):
                outcomponents[idx] = m[i, j]
                outcomponents_alt[idx] = m_alt[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_quat_constructor, suffix=dtype.__name__)

    if register_kernels:
        return

    # translation:
    p = wp.array(rng.standard_normal(size=(1, 3)).astype(dtype), dtype=vec3, requires_grad=True, device=device)

    # generate a normalized quaternion for the rotation:
    r = rng.standard_normal(size=(1, 4))
    r /= np.linalg.norm(r)
    r = wp.array(r.astype(dtype), dtype=quat, requires_grad=True, device=device)

    # scale:
    s = wp.array(rng.standard_normal(size=(1, 3)).astype(dtype), dtype=vec3, requires_grad=True, device=device)

    # just going to generate the matrix using the constructor, then
    # more manually, and make sure the values/gradients are the same:
    outcomponents = wp.zeros(4 * 4, dtype=wptype, requires_grad=True, device=device)
    outcomponents_alt = wp.zeros(4 * 4, dtype=wptype, requires_grad=True, device=device)
    wp.launch(kernel, dim=1, inputs=[p, r, s], outputs=[outcomponents, outcomponents_alt], device=device)
    # NOTE(review): this value comparison uses a fixed 1.0e-6 rather than the
    # per-dtype `tol` defined above -- confirm that is intended.
    assert_np_equal(outcomponents.numpy(), outcomponents_alt.numpy(), tol=1.0e-6)

    # For each of the 16 components, backpropagate from that single component
    # of each matrix in turn and compare the resulting input gradients.
    idx = 0
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    out_alt = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    for i in range(4):
        for j in range(4):
            tape = wp.Tape()
            with tape:
                wp.launch(kernel, dim=1, inputs=[p, r, s], outputs=[outcomponents, outcomponents_alt], device=device)
                wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
                wp.launch(
                    output_select_kernel, dim=1, inputs=[outcomponents_alt, idx], outputs=[out_alt], device=device
                )

            # gradients of component `idx` of the constructor result;
            # the `1.0 *` presumably takes a copy before tape.zero() below
            tape.backward(loss=out)
            p_grad = 1.0 * tape.gradients[p].numpy()[0]
            r_grad = 1.0 * tape.gradients[r].numpy()[0]
            s_grad = 1.0 * tape.gradients[s].numpy()[0]
            tape.zero()

            # gradients of the same component of the hand-built matrix
            tape.backward(loss=out_alt)
            p_grad_alt = 1.0 * tape.gradients[p].numpy()[0]
            r_grad_alt = 1.0 * tape.gradients[r].numpy()[0]
            s_grad_alt = 1.0 * tape.gradients[s].numpy()[0]
            tape.zero()

            assert_np_equal(p_grad, p_grad_alt, tol=tol)
            assert_np_equal(r_grad, r_grad_alt, tol=tol)
            assert_np_equal(s_grad, s_grad_alt, tol=tol)

            idx = idx + 1


def test_negation(test, device, dtype, register_kernels=False):
    """Check unary negation of 2x2 .. 5x5 matrices and, for floats, its gradient.

    The kernel negates one matrix of each size and writes twice each negated
    component, row-major and size after size, into a flat output array. The
    values are compared against ``-2 * input``; for float dtypes each output
    component is then backpropagated and the input gradient is expected to be
    ``-2`` at the matching entry and zero elsewhere.

    When ``register_kernels`` is true the function returns right after the
    kernel has been created through ``getkernel``.
    """
    rng = np.random.default_rng(123)

    float_tols = {np.float16: 1.0e-2, np.float32: 1.0e-6, np.float64: 1.0e-8}
    tol = float_tols.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
    mat55 = wp.types.matrix(shape=(5, 5), dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    # Kernel body intentionally left as in the original: it is handed to
    # wp.Kernel, so only the host-side code around it is restyled.
    def check_mat_negation(
        m2: wp.array(dtype=mat22),
        m3: wp.array(dtype=mat33),
        m4: wp.array(dtype=mat44),
        m5: wp.array(dtype=mat55),
        outcomponents: wp.array(dtype=wptype),
    ):
        mat2 = -m2[0]
        mat3 = -m3[0]
        mat4 = -m4[0]
        mat5 = -m5[0]

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(2):
            for j in range(2):
                outcomponents[idx] = wptype(2) * mat2[i, j]
                idx = idx + 1

        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * mat3[i, j]
                idx = idx + 1

        for i in range(4):
            for j in range(4):
                outcomponents[idx] = wptype(2) * mat4[i, j]
                idx = idx + 1

        for i in range(5):
            for j in range(5):
                outcomponents[idx] = wptype(2) * mat5[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_negation, suffix=dtype.__name__)

    if register_kernels:
        return

    # Inputs are drawn in size order (2, 3, 4, 5) so the RNG stream is consumed
    # in the same sequence every run.
    sizes_and_types = [(2, mat22), (3, mat33), (4, mat44), (5, mat55)]
    m2, m3, m4, m5 = [
        wp.array(randvals(rng, [1, n, n], dtype), dtype=mat_type, requires_grad=True, device=device)
        for n, mat_type in sizes_and_types
    ]
    total_components = sum(n * n for n, _ in sizes_and_types)
    outcomponents = wp.zeros(total_components, dtype=wptype, requires_grad=True, device=device)

    def run_kernel():
        wp.launch(kernel, dim=1, inputs=[m2, m3, m4, m5], outputs=[outcomponents], device=device)

    run_kernel()

    # Value check: walk the flat output one matrix-sized slice at a time.
    sized_inputs = [(2, m2), (3, m3), (4, m4), (5, m5)]
    flat = outcomponents.numpy()
    start = 0
    for n, mat in sized_inputs:
        stop = start + n * n
        assert_np_equal(flat[start:stop], -2 * mat.numpy().reshape(-1), tol=tol)
        start = stop

    if dtype not in np_float_types:
        return

    # Gradient check: one backward pass per output component.
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    idx = 0
    for dim, mat in sized_inputs:
        for i, j in np.ndindex(dim, dim):
            tape = wp.Tape()
            with tape:
                run_kernel()
                wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
            tape.backward(loss=out)

            expectedresult = np.zeros((dim, dim), dtype=dtype)
            expectedresult[i, j] = -2
            assert_np_equal(tape.gradients[mat].numpy()[0], expectedresult)
            tape.zero()
            idx += 1


def test_subtraction(test, device, dtype, register_kernels=False):
    """Check matrix subtraction for 2x2 .. 5x5 matrices and, for floats, its gradients.

    The kernel computes ``v - s`` for one pair of matrices of each size and
    writes twice each component, row-major and size after size, into a flat
    output array. The values are compared against ``2 * (v - s)`` computed
    with NumPy; for float dtypes each output component is backpropagated and
    the gradients are expected to be ``+2`` (for ``v``) and ``-2`` (for ``s``)
    at the matching entry and zero elsewhere.

    When ``register_kernels`` is true the function returns right after the
    kernel has been created through ``getkernel``.
    """
    rng = np.random.default_rng(123)

    float_tols = {np.float16: 5.0e-3, np.float32: 1.0e-6, np.float64: 1.0e-8}
    tol = float_tols.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
    mat55 = wp.types.matrix(shape=(5, 5), dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    # Kernel body intentionally left as in the original: it is handed to
    # wp.Kernel, so only the host-side code around it is restyled.
    def check_mat_sub(
        s2: wp.array(dtype=mat22),
        s3: wp.array(dtype=mat33),
        s4: wp.array(dtype=mat44),
        s5: wp.array(dtype=mat55),
        v2: wp.array(dtype=mat22),
        v3: wp.array(dtype=mat33),
        v4: wp.array(dtype=mat44),
        v5: wp.array(dtype=mat55),
        outcomponents: wp.array(dtype=wptype),
    ):
        v2result = v2[0] - s2[0]
        v3result = v3[0] - s3[0]
        v4result = v4[0] - s4[0]
        v5result = v5[0] - s5[0]

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(2):
            for j in range(2):
                outcomponents[idx] = wptype(2) * v2result[i, j]
                idx = idx + 1

        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * v3result[i, j]
                idx = idx + 1

        for i in range(4):
            for j in range(4):
                outcomponents[idx] = wptype(2) * v4result[i, j]
                idx = idx + 1

        for i in range(5):
            for j in range(5):
                outcomponents[idx] = wptype(2) * v5result[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_sub, suffix=dtype.__name__)

    if register_kernels:
        return

    sizes_and_types = [(2, mat22), (3, mat33), (4, mat44), (5, mat55)]

    def random_operands():
        # One random matrix array per size, drawn in size order.
        return [
            wp.array(randvals(rng, [1, n, n], dtype), dtype=mat_type, requires_grad=True, device=device)
            for n, mat_type in sizes_and_types
        ]

    # All subtrahends are drawn before any minuend, so the RNG stream is
    # consumed in the order s2, s3, s4, s5, v2, v3, v4, v5.
    s2, s3, s4, s5 = random_operands()
    v2, v3, v4, v5 = random_operands()

    total_components = sum(n * n for n, _ in sizes_and_types)
    outcomponents = wp.zeros(total_components, dtype=wptype, requires_grad=True, device=device)

    def run_kernel():
        wp.launch(
            kernel,
            dim=1,
            inputs=[s2, s3, s4, s5, v2, v3, v4, v5],
            outputs=[outcomponents],
            device=device,
        )

    run_kernel()

    # Value check; the 5x5 slice is compared with a 10x looser tolerance.
    flat = outcomponents.numpy()
    value_checks = [(2, s2, v2, tol), (3, s3, v3, tol), (4, s4, v4, tol), (5, s5, v5, 10 * tol)]
    start = 0
    for n, subtrahend, minuend, value_tol in value_checks:
        stop = start + n * n
        expected = 2 * (minuend.numpy() - subtrahend.numpy()).reshape(-1)
        assert_np_equal(flat[start:stop], expected, tol=value_tol)
        start = stop

    if dtype not in np_float_types:
        return

    # Gradient check: one backward pass per output component.
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    idx = 0
    for dim, subtrahend, minuend in [(2, s2, v2), (3, s3, v3), (4, s4, v4), (5, s5, v5)]:
        for i, j in np.ndindex(dim, dim):
            tape = wp.Tape()
            with tape:
                run_kernel()
                wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
            tape.backward(loss=out)

            expectedresult = np.zeros((dim, dim), dtype=dtype)
            expectedresult[i, j] = 2
            assert_np_equal(tape.gradients[minuend].numpy()[0], expectedresult, tol=10 * tol)
            expectedresult[i, j] = -2
            assert_np_equal(tape.gradients[subtrahend].numpy()[0], expectedresult, tol=10 * tol)
            tape.zero()

            idx += 1


def test_determinant(test, device, dtype, register_kernels=False):
    """Check ``wp.determinant`` for 2x2, 3x3 and 4x4 matrices and its gradient.

    The kernel writes twice the determinant of each input matrix. The values
    are compared against ``np.linalg.det`` (rounded to integers for
    non-float dtypes). For float dtypes the gradients from the tape are then
    compared, entry by entry, against central finite differences obtained by
    re-launching the kernel on perturbed inputs.

    When ``register_kernels`` is true the function returns right after the
    kernel has been created through ``getkernel``.
    """
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)

    def check_mat_det(
        v2: wp.array(dtype=mat22),
        v3: wp.array(dtype=mat33),
        v4: wp.array(dtype=mat44),
        det2: wp.array(dtype=wptype),
        det3: wp.array(dtype=wptype),
        det4: wp.array(dtype=wptype),
    ):
        # multiply outputs by 2 so we've got something to backpropagate:
        det2[0] = wptype(2) * wp.determinant(v2[0])
        det3[0] = wptype(2) * wp.determinant(v3[0])
        det4[0] = wptype(2) * wp.determinant(v4[0])

    kernel = getkernel(check_mat_det, suffix=dtype.__name__)
    if register_kernels:
        return

    v2 = wp.array(randvals(rng, [1, 2, 2], dtype), dtype=mat22, requires_grad=True, device=device)
    v3 = wp.array(randvals(rng, [1, 3, 3], dtype), dtype=mat33, requires_grad=True, device=device)
    v4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
    det2 = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    det3 = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
    det4 = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    tape = wp.Tape()
    with tape:
        wp.launch(kernel, dim=1, inputs=[v2, v3, v4], outputs=[det2, det3, det4], device=device)

    if dtype not in np_float_types:
        # Integer dtypes: compare against the rounded NumPy determinant only;
        # there is no gradient check for them.
        assert_np_equal(det2.numpy()[0], 2 * np.around(np.linalg.det(v2.numpy()[0])).astype(int))
        assert_np_equal(det3.numpy()[0], 2 * np.around(np.linalg.det(v3.numpy()[0])).astype(int))
        assert_np_equal(det4.numpy()[0], 2 * np.around(np.linalg.det(v4.numpy()[0])).astype(int))
        return

    assert_np_equal(det2.numpy()[0], 2 * np.linalg.det(v2.numpy()[0].astype(np.float64)), tol=100 * tol)
    assert_np_equal(det3.numpy()[0], 2 * np.linalg.det(v3.numpy()[0].astype(np.float64)), tol=100 * tol)
    assert_np_equal(det4.numpy()[0], 2 * np.linalg.det(v4.numpy()[0].astype(np.float64)), tol=420 * tol)

    mats = (v2, v3, v4)
    dets = (det2, det3, det4)

    # determinant derivative formula is annoying so finite differences?
    # First collect the tape gradients, one backward pass per determinant.
    analytic_grads = []
    for mat, det in zip(mats, dets):
        tape.backward(loss=det)
        analytic_grads.append(1.0 * tape.gradients[mat].numpy()[0])
        tape.zero()

    # finite differences are also annoying hence the large tolerance...
    # absolute nightmare in float16 too innit...
    dx = 0.01 if dtype == np.float16 else 0.0001
    fdtol = 2.0e-1 if dtype == np.float16 else 2.0e-3

    def det_with_replacement(slot, values):
        # Launch the kernel with input number `slot` replaced by `values`
        # and return the matching (doubled) determinant.
        inputs = list(mats)
        inputs[slot] = wp.array(values, dtype=mats[slot].dtype, requires_grad=True, device=device)
        wp.launch(kernel, dim=1, inputs=inputs, outputs=[det2, det3, det4], device=device)
        return dets[slot].numpy()[0]

    for slot, (dim, mat, grads) in enumerate(zip((2, 3, 4), mats, analytic_grads)):
        for i in range(dim):
            for j in range(dim):
                # Perturb a copy: numpy() may alias the Warp array's memory
                # (CPU arrays), and perturbing in place would leave every
                # entry shifted by -dx for all later evaluations.
                perturbed = mat.numpy().copy()
                perturbed[0, i, j] += dx
                dplus = det_with_replacement(slot, perturbed)
                perturbed[0, i, j] -= 2.0 * dx
                dminus = det_with_replacement(slot, perturbed)
                # both sides are divided by dplus, making the comparison relative
                assert_np_equal((dplus - dminus) / (2.0 * dx * dplus), grads[i, j] / dplus, tol=fdtol)


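# NOTE: test_get_diag below is disabled (commented out); the reason is not
# recorded here. As written it would need fixes before being re-enabled:
# randvals() is called without its `rng` argument, and mat55 is built with
# wp.types.vector(shape=...) where the other tests in this file use
# wp.types.matrix(shape=...).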
# def test_get_diag(test, device, dtype, register_kernels=False):
#     tol = {
#         np.float16: 1.0e-3,
#         np.float32: 1.0e-6,
#         np.float64: 1.0e-8,
#     }.get(dtype, 0)
#
#     wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
#     mat55 = wp.types.vector(shape=(5, 5), dtype=wptype)
#
#     output_select_kernel = get_select_kernel(wptype)
#
#     def check_mat_diag(
#         m55: wp.array(dtype=mat55),
#         outcomponents: wp.array(dtype=wptype),
#     ):
#         # multiply outputs by 2 so we've got something to backpropagate:
#         vec5result = wptype(2) * wp.get_diag(m55[0])
#
#         idx = 0
#         for i in range(5):
#             outcomponents[idx] = vec5result[i]
#             idx = idx + 1
#
#     kernel = getkernel(check_mat_diag, suffix=dtype.__name__)
#
#     if register_kernels:
#         return
#
#     m55 = wp.array(randvals((1, 5, 5), dtype), dtype=mat55, requires_grad=True, device=device)
#     outcomponents = wp.zeros(5, dtype=wptype, requires_grad=True, device=device)
#     out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
#
#     wp.launch(kernel, dim=1, inputs=[m55], outputs=[outcomponents], device=device)
#
#     assert_np_equal(outcomponents.numpy(), 2 * np.diag(m55.numpy()[0]), tol=tol)
#
#     if dtype in np_float_types:
#         idx = 0
#         for i in range(5):
#             tape = wp.Tape()
#             with tape:
#                 wp.launch(kernel, dim=1, inputs=[m55], outputs=[outcomponents], device=device)
#                 wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
#             tape.backward(loss=out)
#             expectedresult = np.zeros((5, 5), dtype=dtype)
#             expectedresult[i, i] = 2
#             assert_np_equal(tape.gradients[m55].numpy()[0], expectedresult, tol=10 * tol)
#             tape.zero()
#
#             idx = idx + 1


def test_inverse(test, device, dtype, register_kernels=False):
    """Check ``wp.inverse`` for 2x2, 3x3 and 4x4 matrices, values and gradients.

    The kernel inverts one matrix of each size and writes twice each inverse
    component into a flat ``outcomponents`` array (4 + 9 + 16 entries).  The
    values are compared against ``np.linalg.inv``; for floating-point dtypes the
    gradients of every component are compared against ``-(A^-1 D A^-1)^T``.
    The 2x2 case is additionally checked against hand-written closed forms.

    Args:
        test: The test case instance (unused here).
        device: Device the arrays are created on and the kernels launched on.
        dtype: NumPy scalar type under test.
        register_kernels: When true, only build/cache the kernel and return.
    """
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 5.0e-2,
        np.float32: 1.0e-5,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    def check_mat_inverse(
        m2: wp.array(dtype=mat22),
        m3: wp.array(dtype=mat33),
        m4: wp.array(dtype=mat44),
        outcomponents: wp.array(dtype=wptype),
    ):
        m2result = wp.inverse(m2[0])
        m3result = wp.inverse(m3[0])
        m4result = wp.inverse(m4[0])

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(2):
            for j in range(2):
                outcomponents[idx] = wptype(2) * m2result[i, j]
                idx = idx + 1

        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * m3result[i, j]
                idx = idx + 1

        for i in range(4):
            for j in range(4):
                outcomponents[idx] = wptype(2) * m4result[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_inverse, suffix=dtype.__name__)

    if register_kernels:
        return

    # Random matrices with a multiple of the identity added (same construction
    # and RNG draw order for every size).
    m2 = wp.array(
        2 * (randvals(rng, [1, 2, 2], dtype) + 0.2 * np.eye(2)), dtype=mat22, requires_grad=True, device=device
    )
    m3 = wp.array(
        2 * (randvals(rng, [1, 3, 3], dtype) + 0.2 * np.eye(3)), dtype=mat33, requires_grad=True, device=device
    )
    m4 = wp.array(
        2 * (randvals(rng, [1, 4, 4], dtype) + 0.2 * np.eye(4)), dtype=mat44, requires_grad=True, device=device
    )

    outcomponents = wp.zeros(2 * 2 + 3 * 3 + 4 * 4, dtype=wptype, requires_grad=True, device=device)
    # Single scalar that receives the selected component; allocated once and
    # reused by every tape below.
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)

    # Reference inverses are computed in float64 regardless of the tested dtype.
    assert_np_equal(outcomponents.numpy()[:4], 2 * np.linalg.inv(m2.numpy()[0].astype(np.float64)), tol=tol)
    assert_np_equal(outcomponents.numpy()[4:13], 2 * np.linalg.inv(m3.numpy()[0].astype(np.float64)), tol=5 * tol)
    assert_np_equal(outcomponents.numpy()[13:], 2 * np.linalg.inv(m4.numpy()[0].astype(np.float64)), tol=5 * tol)

    if dtype in np_float_types:
        # check gradients:
        idx = 0
        for dim, mat in [(2, m2), (3, m3), (4, m4)]:
            minv = np.linalg.inv(mat.numpy()[0].astype(np.float64))
            for i in range(dim):
                for j in range(dim):
                    tape = wp.Tape()
                    with tape:
                        wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
                        wp.launch(
                            output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device
                        )
                    tape.backward(loss=out)
                    # d carries the factor of 2 applied in the kernel at the
                    # transposed position of the selected component.
                    d = np.zeros((dim, dim))
                    d[j, i] = 2
                    assert_np_equal(
                        tape.gradients[mat].numpy()[0], -np.matmul(minv, np.matmul(d, minv)).T, tol=10 * tol
                    )
                    tape.zero()

                    idx = idx + 1

    # let's check 2x2 using different formulae just for (in)sanity's sake:
    m = m2.numpy()[0]
    m00, m01, m10, m11 = m[0, 0], m[0, 1], m[1, 0], m[1, 1]

    det = m00 * m11 - m10 * m01
    det_sq = det**2
    expected = 2 * np.array([[m11, -m01], [-m10, m00]], dtype=dtype) / det
    assert_np_equal(expected, outcomponents.numpy()[:4], tol=tol)

    # One entry per output component (row-major order of the 2x2 inverse):
    #   (numerator of the doubled inverse entry,
    #    [((row, col) of m2, numerator of d(component)/d m2[row, col]), ...])
    # Values are numerator / det, gradients are numerator / det**2.
    closed_forms = [
        # 0,0 component: 2 * m11 / det
        (
            2 * m11,
            [((0, 0), -2 * m11 * m11), ((1, 0), 2 * m11 * m01), ((1, 1), -2 * m01 * m10), ((0, 1), 2 * m11 * m10)],
        ),
        # 0,1 component: -2 * m01 / det
        (
            -2 * m01,
            [((0, 0), 2 * m01 * m11), ((1, 0), -2 * m01 * m01), ((1, 1), 2 * m00 * m01), ((0, 1), -2 * m11 * m00)],
        ),
        # 1,0 component: -2 * m10 / det
        (
            -2 * m10,
            [((0, 0), 2 * m11 * m10), ((1, 0), -2 * m00 * m11), ((1, 1), 2 * m00 * m10), ((0, 1), -2 * m10 * m10)],
        ),
        # 1,1 component: 2 * m00 / det
        (
            2 * m00,
            [((0, 0), -2 * m01 * m10), ((1, 0), 2 * m00 * m01), ((0, 1), 2 * m00 * m10), ((1, 1), -2 * m00 * m00)],
        ),
    ]

    for component, (value_numerator, grad_numerators) in enumerate(closed_forms):
        assert_np_equal(value_numerator / det, outcomponents.numpy()[component], tol=tol)

        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, component], outputs=[out], device=device)

        if dtype in np_float_types:
            tape.backward(loss=out)
            g = tape.gradients[m2].numpy()[0]
            for (row, col), grad_numerator in grad_numerators:
                assert_np_equal(grad_numerator / det_sq, g[row, col], tol=tol)
            tape.zero()


def test_svd(test, device, dtype, register_kernels=False):
    """Check ``wp.svd3`` on a 3x3 matrix: reconstruction and gradients.

    The kernel stores U, sigma and V and also writes twice each of their
    components into ``outcomponents`` (9 + 3 + 9 entries).  The test verifies
    that ``U @ diag(sigma) @ V.T`` reproduces the input, then compares the
    tape gradient of every component against a central finite difference.
    Gradients are skipped for float16.

    Args:
        test: The test case instance (unused here).
        device: Device the arrays are created on and the kernels launched on.
        dtype: NumPy scalar type under test.
        register_kernels: When true, only build/cache the kernels and return.
    """
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 1.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-6,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    vec3 = wp.types.vector(length=3, dtype=wptype)
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)

    def check_mat_svd(
        m3: wp.array(dtype=mat33),
        Uout: wp.array(dtype=mat33),
        sigmaout: wp.array(dtype=vec3),
        Vout: wp.array(dtype=mat33),
        outcomponents: wp.array(dtype=wptype),
    ):
        U = mat33()
        sigma = vec3()
        V = mat33()

        wp.svd3(m3[0], U, sigma, V)

        Uout[0] = U
        sigmaout[0] = sigma
        Vout[0] = V

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * U[i, j]
                idx = idx + 1

        for i in range(3):
            outcomponents[idx] = wptype(2) * sigma[i]
            idx = idx + 1

        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * V[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_svd, suffix=dtype.__name__)

    output_select_kernel = get_select_kernel(wptype)

    if register_kernels:
        return

    m3 = wp.array(randvals(rng, [1, 3, 3], dtype) + np.eye(3), dtype=mat33, requires_grad=True, device=device)

    outcomponents = wp.zeros(2 * 3 * 3 + 3, dtype=wptype, requires_grad=True, device=device)
    Uout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
    sigmaout = wp.zeros(1, dtype=vec3, requires_grad=True, device=device)
    Vout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[m3], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)

    Uout_np = Uout.numpy()[0].astype(np.float64)
    sigmaout_np = np.diag(sigmaout.numpy()[0].astype(np.float64))
    Vout_np = Vout.numpy()[0].astype(np.float64)

    # check it's a factorization:
    assert_np_equal(
        np.matmul(Uout_np, np.matmul(sigmaout_np, Vout_np.T)), m3.numpy()[0].astype(np.float64), tol=30 * tol
    )

    if dtype == np.float16:
        # I'm not even going to bother testing the gradients for float16
        # because the rounding errors are terrible...
        return

    # check gradients:
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    # Finite-difference step and tolerance do not depend on the component.
    dx = 0.0001
    fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2

    def perturbed_component(component, row, col, delta):
        # Re-run the kernel on a copy of m3 with entry (row, col) shifted by
        # delta and return the selected output component.
        m3test = 1.0 * m3.numpy()
        m3test[0, row, col] += delta
        wp.launch(
            kernel,
            dim=1,
            inputs=[wp.array(m3test, dtype=mat33, device=device)],
            outputs=[Uout, sigmaout, Vout, outcomponents],
            device=device,
        )
        wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, component], outputs=[out], device=device)
        return out.numpy()[0]

    for idx in range(len(outcomponents)):
        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=1, inputs=[m3], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
        tape.backward(out)
        # Copy the gradient before the tape is zeroed.
        m3grads = 1.0 * tape.gradients[m3].numpy()[0]

        tape.zero()

        for ii in range(3):
            for jj in range(3):
                plusval = perturbed_component(idx, ii, jj, dx)
                minusval = perturbed_component(idx, ii, jj, -dx)

                assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)


def test_qr(test, device, dtype, register_kernels=False):
    """Check ``wp.qr3`` on a 3x3 matrix: factor properties and gradients.

    The kernel stores Q and R and also writes twice each of their components
    into ``outcomponents`` (9 + 9 entries).  The test verifies that
    ``Q.T @ Q`` is the identity, that the entries of R below the diagonal are
    zero, and that ``Q @ R`` reproduces the input; it then compares the tape
    gradient of every component against a central finite difference.
    Gradients are skipped for float16.

    Args:
        test: The test case instance (unused here).
        device: Device the arrays are created on and the kernels launched on.
        dtype: NumPy scalar type under test.
        register_kernels: When true, only build/cache the kernels and return.
    """
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 2.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-6,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)

    def check_mat_qr(
        m3: wp.array(dtype=mat33),
        Qout: wp.array(dtype=mat33),
        Rout: wp.array(dtype=mat33),
        outcomponents: wp.array(dtype=wptype),
    ):
        Q = mat33()
        R = mat33()

        wp.qr3(m3[0], Q, R)

        Qout[0] = Q
        Rout[0] = R

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * Q[i, j]
                idx = idx + 1

        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * R[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_qr, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel(wptype)

    if register_kernels:
        return

    m3 = wp.array(0.5 * (randvals(rng, [1, 3, 3], dtype) + np.eye(3)), dtype=mat33, requires_grad=True, device=device)

    outcomponents = wp.zeros(2 * 3 * 3, dtype=wptype, requires_grad=True, device=device)
    Qout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
    Rout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, Rout, outcomponents], device=device)

    Qout_np = Qout.numpy()[0].astype(np.float64)
    Rout_np = Rout.numpy()[0].astype(np.float64)

    # check it's actually a q and an r:
    assert_np_equal(np.matmul(Qout_np.T, Qout_np), np.eye(3, dtype=np.float64), tol=tol)
    assert_np_equal(Rout_np[1, [0]], np.zeros(1, dtype=np.float64), tol=tol)
    assert_np_equal(Rout_np[2, [0, 1]], np.zeros(2, dtype=np.float64), tol=tol)

    # check it's a factorization:
    assert_np_equal(np.matmul(Qout_np, Rout_np), m3.numpy()[0].astype(np.float64), tol=30 * tol)

    if dtype == np.float16:
        # I'm not even going to bother testing the gradients for float16
        # because the rounding errors are terrible...
        return

    # check gradients:
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    # Finite-difference step and tolerance do not depend on the component.
    dx = 0.0001
    fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2

    def perturbed_component(component, row, col, delta):
        # Re-run the kernel on a copy of m3 with entry (row, col) shifted by
        # delta and return the selected output component.
        m3test = 1.0 * m3.numpy()
        m3test[0, row, col] += delta
        wp.launch(
            kernel,
            dim=1,
            inputs=[wp.array(m3test, dtype=mat33, device=device)],
            outputs=[Qout, Rout, outcomponents],
            device=device,
        )
        wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, component], outputs=[out], device=device)
        return out.numpy()[0]

    for idx in range(len(outcomponents)):
        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, Rout, outcomponents], device=device)
            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
        tape.backward(out)
        # Copy the gradient before the tape is zeroed.
        m3grads = 1.0 * tape.gradients[m3].numpy()[0]

        tape.zero()

        for ii in range(3):
            for jj in range(3):
                plusval = perturbed_component(idx, ii, jj, dx)
                minusval = perturbed_component(idx, ii, jj, -dx)

                assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)


def test_eig(test, device, dtype, register_kernels=False):
    """Check ``wp.eig3`` on a symmetrised 3x3 matrix: decomposition and gradients.

    The kernel calls ``wp.eig3`` on ``m3 + transpose(m3)``, stores Q and d, and
    writes twice each of their components into ``outcomponents`` (9 + 3
    entries).  The test verifies that ``Q.T @ Q`` is the identity and that
    ``Q @ diag(d) @ Q.T`` reproduces ``m3 + m3.T``; it then compares the tape
    gradient of every component against a central finite difference.
    Gradients are skipped for float16.

    Args:
        test: The test case instance (unused here).
        device: Device the arrays are created on and the kernels launched on.
        dtype: NumPy scalar type under test.
        register_kernels: When true, only build/cache the kernels and return.
    """
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 4.0e-2,
        np.float32: 1.0e-5,
        np.float64: 1.0e-5,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    vec3 = wp.types.vector(length=3, dtype=wptype)
    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)

    def check_mat_eig(
        m3: wp.array(dtype=mat33),
        Qout: wp.array(dtype=mat33),
        dout: wp.array(dtype=vec3),
        outcomponents: wp.array(dtype=wptype),
    ):
        Q = mat33()
        d = vec3()

        wp.eig3(m3[0] + wp.transpose(m3[0]), Q, d)

        Qout[0] = Q
        dout[0] = d

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * Q[i, j]
                idx = idx + 1

        for i in range(3):
            outcomponents[idx] = wptype(2) * d[i]
            idx = idx + 1

    kernel = getkernel(check_mat_eig, suffix=dtype.__name__)
    output_select_kernel = get_select_kernel(wptype)

    if register_kernels:
        return

    m3_np = randvals(rng, [1, 3, 3], dtype) + np.eye(3, dtype=dtype)
    m3 = wp.array(m3_np, dtype=mat33, requires_grad=True, device=device)

    outcomponents = wp.zeros(3 * 3 + 3, dtype=wptype, requires_grad=True, device=device)
    Qout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
    dout = wp.zeros(1, dtype=vec3, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, dout, outcomponents], device=device)

    Qout_np = Qout.numpy()[0].astype(np.float64)
    dout_np = dout.numpy()[0].astype(np.float64)
    Dout_np = np.diag(dout_np)

    # check Q is orthogonal:
    assert_np_equal(np.matmul(Qout_np.T, Qout_np), np.eye(3), tol=tol)

    # check Q contains eigenvectors:
    assert_np_equal(np.matmul(Qout_np, np.matmul(Dout_np, Qout_np.T)), (m3_np[0] + m3_np[0].transpose()), tol=tol)

    if dtype == np.float16:
        # I'm not even going to bother testing the gradients for float16
        # because the rounding errors are terrible...
        return

    # check gradients:
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    # Finite-difference step and tolerance do not depend on the component.
    dx = 0.0001
    fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2

    def perturbed_component(component, row, col, delta):
        # Re-run the kernel on a copy of m3 with entry (row, col) shifted by
        # delta and return the selected output component.
        m3test = 1.0 * m3.numpy()
        m3test[0, row, col] += delta
        wp.launch(
            kernel,
            dim=1,
            inputs=[wp.array(m3test, dtype=mat33, device=device)],
            outputs=[Qout, dout, outcomponents],
            device=device,
        )
        wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, component], outputs=[out], device=device)
        return out.numpy()[0]

    for idx in range(len(outcomponents)):
        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, dout, outcomponents], device=device)
            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
        tape.backward(out)
        # Copy the gradient before the tape is zeroed.
        m3grads = 1.0 * tape.gradients[m3].numpy()[0]

        tape.zero()

        for ii in range(3):
            for jj in range(3):
                plusval = perturbed_component(idx, ii, jj, dx)
                minusval = perturbed_component(idx, ii, jj, -dx)

                assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)


def test_skew(test, device, dtype, register_kernels=False):
    """Check ``wp.skew`` produces the cross-product matrix of a 3-vector.

    The kernel writes twice each component of ``wp.skew(v3[0])`` into a flat
    9-element array.  The result is checked by applying it to the three unit
    basis vectors, by comparing it with the explicit skew-symmetric matrix,
    and (for floating-point dtypes) by checking the gradient of every
    component with respect to the input vector.

    Args:
        test: The test case instance (unused here).
        device: Device the arrays are created on and the kernels launched on.
        dtype: NumPy scalar type under test.
        register_kernels: When true, only build/cache the kernel and return.
    """
    rng = np.random.default_rng(123)

    tol = {
        np.float16: 1.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    vec3 = wp.types.vector(length=3, dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    def check_mat_skew(
        v3: wp.array(dtype=vec3),
        outcomponents: wp.array(dtype=wptype),
    ):
        m3result = wp.skew(v3[0])

        # multiply outputs by 2 so we've got something to backpropagate:
        idx = 0
        for i in range(3):
            for j in range(3):
                outcomponents[idx] = wptype(2) * m3result[i, j]
                idx = idx + 1

    kernel = getkernel(check_mat_skew, suffix=dtype.__name__)

    if register_kernels:
        return

    v3 = wp.array(randvals(rng, [1, 3], dtype), dtype=vec3, requires_grad=True, device=device)

    outcomponents = wp.zeros(3 * 3, dtype=wptype, requires_grad=True, device=device)
    # Single scalar that receives the selected component for the gradient checks.
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[v3], outputs=[outcomponents], device=device)

    # make sure it gives you a cross product matrix:
    crossprodmat = outcomponents.numpy().reshape(3, 3)
    for basis in ([1, 0, 0], [0, 1, 0], [0, 0, 1]):
        unit = np.array(basis)
        assert_np_equal(
            np.matmul(crossprodmat, unit).reshape(-1),
            2 * np.cross(v3.numpy()[0], unit),
            tol=tol,
        )

    # check it another way:
    x0 = v3.numpy()[0, 0]
    x1 = v3.numpy()[0, 1]
    x2 = v3.numpy()[0, 2]
    crossprodmat_expected = np.array(
        [
            [0, -x2, x1],
            [x2, 0, -x0],
            [-x1, x0, 0],
        ],
        dtype=dtype,
    )
    assert_np_equal(crossprodmat, 2 * crossprodmat_expected, tol=tol)

    if dtype in np_float_types:
        # Expected gradient of the doubled (i, j) entry with respect to v3;
        # entries not listed (the diagonal) do not depend on v3.
        expected_grads = {
            (0, 1): [0, 0, -2],
            (1, 0): [0, 0, 2],
            (0, 2): [0, 2, 0],
            (2, 0): [0, -2, 0],
            (1, 2): [-2, 0, 0],
            (2, 1): [2, 0, 0],
        }

        for i in range(3):
            for j in range(3):
                # Components were written in row-major order by the kernel.
                idx = 3 * i + j
                tape = wp.Tape()
                with tape:
                    wp.launch(kernel, dim=1, inputs=[v3], outputs=[outcomponents], device=device)
                    wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
                tape.backward(loss=out)
                assert_np_equal(tape.gradients[v3].numpy()[0], np.array(expected_grads.get((i, j), [0, 0, 0])))
                tape.zero()


def test_transform_point(test, device, dtype, register_kernels=False):
    """Check ``wp.transform_point`` against a homogeneous matrix-vector product.

    The kernel writes twice the transformed point into ``outcomponents``.  The
    reference treats the point as ``(x, y, z, 1)`` and keeps the first three
    entries of ``m4 @ point``.  For floating-point dtypes the gradients with
    respect to both the point and the matrix are verified per component.
    """
    rng = np.random.default_rng(123)

    tolerances = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }
    tol = tolerances.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    vec3 = wp.types.vector(length=3, dtype=wptype)
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    def check_mat_transform_point(
        v3: wp.array(dtype=vec3),
        m4: wp.array(dtype=mat44),
        outcomponents: wp.array(dtype=wptype),
    ):
        # multiply outputs by 2 so we've got something to backpropagate:
        presult = wptype(2) * wp.transform_point(m4[0], v3[0])

        outcomponents[0] = presult[0]
        outcomponents[1] = presult[1]
        outcomponents[2] = presult[2]

    kernel = getkernel(check_mat_transform_point, suffix=dtype.__name__)

    if register_kernels:
        return

    # The point is drawn before the matrix so the RNG stream is consumed in a fixed order.
    v3 = wp.array(randvals(rng, [1, 3], dtype), dtype=vec3, requires_grad=True, device=device)
    m4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)

    outcomponents = wp.zeros(3, dtype=wptype, requires_grad=True, device=device)
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)

    # Homogeneous point: w component is one, so the last matrix column contributes.
    point_h = np.concatenate([v3.numpy()[0], np.ones(1, dtype=dtype)])
    reference = 2 * np.matmul(m4.numpy()[0], point_h)[:3]
    assert_np_equal(outcomponents.numpy(), reference, tol=10 * tol)

    if dtype not in np_float_types:
        return

    for component in range(3):
        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)
            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, component], outputs=[out], device=device)
        tape.backward(loss=out)

        # d(out)/d(v3) is twice the first three entries of the selected matrix row.
        assert_np_equal(2 * m4.numpy()[0, component, :3], tape.gradients[v3].numpy(), tol=tol)

        # d(out)/d(m4) is non-zero only in the selected row: twice the point, and 2 for the w column.
        matrix_grad = np.zeros((4, 4), dtype=dtype)
        matrix_grad[component, :3] = 2 * v3.numpy()
        matrix_grad[component, 3] = 2
        assert_np_equal(tape.gradients[m4].numpy(), matrix_grad, tol=tol)

        tape.zero()


def test_transform_vector(test, device, dtype, register_kernels=False):
    """Check ``wp.transform_vector`` against a homogeneous matrix-vector product.

    The kernel writes twice the transformed vector into ``outcomponents``.  The
    reference treats the vector as ``(x, y, z, 0)`` and keeps the first three
    entries of ``m4 @ vector``.  For floating-point dtypes the gradients with
    respect to both the vector and the matrix are verified per component.
    """
    rng = np.random.default_rng(123)

    tolerances = {
        np.float16: 5.0e-3,
        np.float32: 1.0e-6,
        np.float64: 1.0e-8,
    }
    tol = tolerances.get(dtype, 0)

    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
    vec3 = wp.types.vector(length=3, dtype=wptype)
    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)

    output_select_kernel = get_select_kernel(wptype)

    def check_mat_transform_vector(
        v3: wp.array(dtype=vec3),
        m4: wp.array(dtype=mat44),
        outcomponents: wp.array(dtype=wptype),
    ):
        # multiply outputs by 2 so we've got something to backpropagate:
        presult = wptype(2) * wp.transform_vector(m4[0], v3[0])

        outcomponents[0] = presult[0]
        outcomponents[1] = presult[1]
        outcomponents[2] = presult[2]

    kernel = getkernel(check_mat_transform_vector, suffix=dtype.__name__)

    if register_kernels:
        return

    # The vector is drawn before the matrix so the RNG stream is consumed in a fixed order.
    v3 = wp.array(randvals(rng, [1, 3], dtype), dtype=vec3, requires_grad=True, device=device)
    m4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)

    outcomponents = wp.zeros(3, dtype=wptype, requires_grad=True, device=device)
    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)

    wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)

    # Homogeneous direction: w component is zero, so the last matrix column is ignored.
    vector_h = np.concatenate([v3.numpy()[0], np.zeros(1, dtype=dtype)])
    reference = 2 * np.matmul(m4.numpy()[0], vector_h)[:3]
    assert_np_equal(outcomponents.numpy(), reference, tol=10 * tol)

    if dtype not in np_float_types:
        return

    for component in range(3):
        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)
            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, component], outputs=[out], device=device)
        tape.backward(loss=out)

        # d(out)/d(v3) is twice the first three entries of the selected matrix row.
        assert_np_equal(2 * m4.numpy()[0, component, :3], tape.gradients[v3].numpy(), tol=tol)

        # d(out)/d(m4) is non-zero only in the first three columns of the selected row.
        matrix_grad = np.zeros((4, 4), dtype=dtype)
        matrix_grad[component, :3] = 2 * v3.numpy()
        assert_np_equal(tape.gradients[m4].numpy(), matrix_grad, tol=tol)

        tape.zero()


# Test matrix constructors using explicit type (float16)
# note that these tests are specifically not using generics / closure
# args to create kernels dynamically (like the rest of this file)
# as those use different code paths to resolve arg types which
# has led to regressions.
@wp.kernel
def test_constructors_explicit_precision():
    # Builds 2x2 float16 matrices three ways (identity, shape/dtype only, and
    # explicit scalar arguments) and checks every element with wp.expect_eq.

    # construction for custom matrix types
    eye = wp.identity(dtype=wp.float16, n=2)
    zeros = wp.matrix(shape=(2, 2), dtype=wp.float16)
    custom = wp.matrix(wp.float16(0.0), wp.float16(1.0), wp.float16(2.0), wp.float16(3.0), shape=(2, 2))

    for i in range(2):
        for j in range(2):
            # identity: ones on the diagonal, zeros elsewhere
            if i == j:
                wp.expect_eq(eye[i, j], wp.float16(1.0))
            else:
                wp.expect_eq(eye[i, j], wp.float16(0.0))

            wp.expect_eq(zeros[i, j], wp.float16(0.0))
            # the scalar arguments 0, 1, 2, 3 are expected at [i, j] == 2 * i + j
            wp.expect_eq(custom[i, j], wp.float16(i) * wp.float16(2.0) + wp.float16(j))


# 3x2 float64 matrix type used by test_matrix_constructor_value_func below.
mat32d = wp.mat(shape=(3, 2), dtype=wp.float64)


@wp.kernel
def test_matrix_constructor_value_func():
    # Exercises several matrix constructor call forms. None of the results are
    # read back or compared.
    # NOTE(review): presumably the test passes if the kernel compiles and
    # launches without error - confirm.
    a = wp.mat22()
    # copy-construct from an existing matrix with an explicit shape
    b = wp.matrix(a, shape=(2, 2))
    c = mat32d()
    d = mat32d(c, shape=(3, 2))
    # six scalar arguments
    e = mat32d(wp.float64(1.0), wp.float64(2.0), wp.float64(1.0), wp.float64(2.0), wp.float64(1.0), wp.float64(2.0))
    # two 3-component vector arguments
    f = mat32d(
        wp.vec3d(wp.float64(1.0), wp.float64(2.0), wp.float64(3.0)),
        wp.vec3d(wp.float64(1.0), wp.float64(2.0), wp.float64(3.0)),
    )


# Same as above but with a default (float/int) type
# which tests some different code paths that
# need to ensure types are correctly canonicalized
# during codegen
@wp.kernel
def test_constructors_default_precision():
    # Builds 2x2 matrices of the default `float` type three ways (identity,
    # shape/dtype only, and explicit scalar arguments) and checks every
    # element with wp.expect_eq.

    # construction for default (float) matrix types
    eye = wp.identity(dtype=float, n=2)
    zeros = wp.matrix(shape=(2, 2), dtype=float)
    custom = wp.matrix(0.0, 1.0, 2.0, 3.0, shape=(2, 2))

    for i in range(2):
        for j in range(2):
            # identity: ones on the diagonal, zeros elsewhere
            if i == j:
                wp.expect_eq(eye[i, j], 1.0)
            else:
                wp.expect_eq(eye[i, j], 0.0)

            wp.expect_eq(zeros[i, j], 0.0)
            # the scalar arguments 0, 1, 2, 3 are expected at [i, j] == 2 * i + j
            wp.expect_eq(custom[i, j], float(i) * 2.0 + float(j))


@wp.kernel
def test_matrix_mutation(expected: wp.types.matrix(shape=(10, 3), dtype=float)):
    # Fills a 10x3 matrix by element and by row assignment, then compares it
    # with `expected`. Row 0 is (1, 2, 3) and every later row is the previous
    # row plus (1, 2, 3).
    m = wp.matrix(shape=(10, 3), dtype=float)

    # test direct element indexing
    m[0, 0] = 1.0
    m[0, 1] = 2.0
    m[0, 2] = 3.0

    # The nested indexing (matrix->vector->scalar) below does not
    # currently modify m because m[0] returns row vector by
    # value rather than reference, this is different from NumPy
    # which always returns by ref. Not clear how we can support
    # this as well as auto-diff.

    # m[0][1] = 2.0
    # m[0][2] = 3.0

    # test setting rows
    for i in range(1, 10):
        m[i] = m[i - 1] + wp.vec3(1.0, 2.0, 3.0)

    wp.expect_eq(m, expected)


# Matrix dimensions declared as Warp constants so they can be referenced
# from inside the kernel below.
CONSTANT_SHAPE_ROWS = wp.constant(10)
CONSTANT_SHAPE_COLS = wp.constant(10)


# tests that we can use global constants in shape keyword argument
# for matrix constructor
@wp.kernel
def test_constructors_constant_shape():
    m = wp.matrix(shape=(CONSTANT_SHAPE_ROWS, CONSTANT_SHAPE_COLS), dtype=float)

    # the same constants also serve as loop bounds; every element is written
    # but nothing is compared afterwards
    for i in range(CONSTANT_SHAPE_ROWS):
        for j in range(CONSTANT_SHAPE_COLS):
            m[i, j] = float(i * j)


# Devices passed to every test registration below.
devices = get_test_devices()


class TestMat(unittest.TestCase):
    """Test case class for the matrix tests in this module.

    No test methods are defined inline; the class is handed to the
    add_*_test registration calls that follow it in the file.
    """


# kernels that need no inputs, each registered with dim=1
add_kernel_test(TestMat, test_constructors_explicit_precision, dim=1, devices=devices)
add_kernel_test(TestMat, test_constructors_default_precision, dim=1, devices=devices)
add_kernel_test(TestMat, test_constructors_constant_shape, dim=1, devices=devices)
add_kernel_test(TestMat, test_matrix_constructor_value_func, dim=1, devices=devices)

# test_matrix_mutation receives its expected 10x3 result as an input:
# row r (0-based) holds (r + 1) * (1.0, 2.0, 3.0), listed row by row.
mat103 = wp.types.matrix(shape=(10, 3), dtype=float)
add_kernel_test(
    TestMat,
    test_matrix_mutation,
    dim=1,
    inputs=[mat103(*[float((row + 1) * factor) for row in range(10) for factor in (1, 2, 3)])],
    devices=devices,
)

# Register the negation and subtraction tests once per signed integer and
# floating point dtype.
#
# np_signed_int_types lists both np.int8 and np.byte, which are the same
# type, so iterating the plain concatenation would build the "..._int8"
# test names twice and register them a second time over the first.
# dict.fromkeys removes the repeated entry while preserving the order.
for dtype in dict.fromkeys(np_signed_int_types + np_float_types):
    add_function_test_register_kernel(
        TestMat, f"test_negation_{dtype.__name__}", test_negation, devices=devices, dtype=dtype
    )
    add_function_test_register_kernel(
        TestMat, f"test_subtraction_{dtype.__name__}", test_subtraction, devices=devices, dtype=dtype
    )

# Constructor error-path tests, all registered with the shared device list.
# Each entry pairs the registered test name with the function implementing it.
for _test_name, _test_func in (
    ("test_anon_constructor_error_shape_keyword_missing", test_anon_constructor_error_shape_keyword_missing),
    ("test_anon_constructor_error_dtype_keyword_missing", test_anon_constructor_error_dtype_keyword_missing),
    ("test_anon_constructor_error_shape_mismatch", test_anon_constructor_error_shape_mismatch),
    ("test_anon_constructor_error_invalid_arg_count", test_anon_constructor_error_invalid_arg_count),
    ("test_tpl_constructor_error_incompatible_sizes", test_tpl_constructor_error_incompatible_sizes),
    ("test_tpl_constructor_error_invalid_scalar_type", test_tpl_constructor_error_invalid_scalar_type),
    ("test_tpl_constructor_error_invalid_vector_count", test_tpl_constructor_error_invalid_vector_count),
    ("test_tpl_constructor_error_invalid_vector_shape", test_tpl_constructor_error_invalid_vector_shape),
    ("test_tpl_constructor_error_invalid_arg_count", test_tpl_constructor_error_invalid_arg_count),
):
    add_function_test(TestMat, _test_name, _test_func, devices=devices)

# unlike the entries above, this test is registered without a devices argument
add_function_test(TestMat, "test_tpl_ops_with_anon", test_tpl_ops_with_anon)

# Tests that are only registered for the floating point dtypes.
for dtype in np_float_types:
    # registered with devices=None instead of the shared device list
    add_function_test(
        TestMat, f"test_py_arithmetic_ops_{dtype.__name__}", test_py_arithmetic_ops, devices=None, dtype=dtype
    )

    # the remaining tests share one registration call; the table keeps the
    # original registration order
    for _test_name, _test_func in (
        (f"test_quat_constructor_{dtype.__name__}", test_quat_constructor),
        (f"test_inverse_{dtype.__name__}", test_inverse),
        (f"test_svd_{dtype.__name__}", test_svd),
        (f"test_qr_{dtype.__name__}", test_qr),
        (f"test_eig_{dtype.__name__}", test_eig),
        (f"test_transform_point_{dtype.__name__}", test_transform_point),
        (f"test_transform_vector_{dtype.__name__}", test_transform_vector),
        (f"test_determinant_{dtype.__name__}", test_determinant),
        (f"test_skew_{dtype.__name__}", test_skew),
    ):
        add_function_test_register_kernel(TestMat, _test_name, _test_func, devices=devices, dtype=dtype)


# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # Clear the Warp kernel cache before running the tests.
    # NOTE(review): presumably so kernels are rebuilt instead of being
    # reused from an earlier run - confirm.
    wp.build.clear_kernel_cache()
    # verbosity=2 prints one line per test; failfast stops the run at the
    # first failure or error
    unittest.main(verbosity=2, failfast=True)