# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import unittest
import numpy as np
import warp as wp
from warp.tests.unittest_utils import *
wp.init()
np_signed_int_types = [
np.int8,
np.int16,
np.int32,
np.int64,
np.byte,
]
np_float_types = [np.float16, np.float32, np.float64]
def randvals(rng, shape, dtype):
if dtype in np_float_types:
return rng.standard_normal(size=shape).astype(dtype)
elif dtype in [np.int8, np.uint8, np.byte, np.ubyte]:
return rng.integers(1, high=3, size=shape, dtype=dtype)
return rng.integers(1, high=5, size=shape, dtype=dtype)
kernel_cache = dict()
def getkernel(func, suffix=""):
key = func.__name__ + "_" + suffix
if key not in kernel_cache:
kernel_cache[key] = wp.Kernel(func=func, key=key)
return kernel_cache[key]
def get_select_kernel(dtype):
def output_select_kernel_fn(
input: wp.array(dtype=dtype),
index: int,
out: wp.array(dtype=dtype),
):
out[0] = input[index]
return getkernel(output_select_kernel_fn, suffix=dtype.__name__)
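# A minimal usage sketch for the two helpers above (illustrative only, not
# exercised by the test suite): build a dtype-specialized select kernel and
# read a single component back from device memory.
def _example_select_usage(device=None):
    select_kernel = get_select_kernel(wp.float32)
    values = wp.array(np.arange(4, dtype=np.float32), dtype=wp.float32, device=device)
    out = wp.zeros(1, dtype=wp.float32, device=device)
    # copies values[2] into out[0]
    wp.launch(select_kernel, dim=1, inputs=[values, 2], outputs=[out], device=device)
    return out.numpy()[0]  # == 2.0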
def test_anon_constructor_error_shape_keyword_missing(test, device):
@wp.kernel
def kernel():
wp.matrix(1.0, 2.0, 3.0)
with test.assertRaisesRegex(
RuntimeError,
r"shape keyword must be specified when calling matrix\(\) function$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_anon_constructor_error_dtype_keyword_missing(test, device):
@wp.kernel
def kernel():
wp.matrix(shape=(3, 3))
with test.assertRaisesRegex(
RuntimeError,
r"matrix\(\) must have dtype as a keyword argument if it has no " r"positional arguments$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_anon_constructor_error_shape_mismatch(test, device):
@wp.kernel
def kernel():
wp.matrix(
wp.matrix(shape=(1, 2), dtype=float),
shape=(3, 4),
dtype=float,
)
with test.assertRaisesRegex(
RuntimeError,
r"Incompatible matrix sizes for casting copy constructor, " r"\(3, 4\) vs \(1, 2\)$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_anon_constructor_error_invalid_arg_count(test, device):
@wp.kernel
def kernel():
wp.matrix(1.0, 2.0, 3.0, shape=(2, 2), dtype=float)
with test.assertRaisesRegex(
RuntimeError,
r"Wrong number of arguments for matrix\(\) function, must initialize "
r"with either a scalar value, or m\*n values$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_tpl_constructor_error_incompatible_sizes(test, device):
@wp.kernel
def kernel():
wp.mat33(wp.mat22(1.0, 2.0, 3.0, 4.0))
with test.assertRaisesRegex(
RuntimeError,
r"Incompatible matrix sizes for casting copy constructor, " r"\(3, 3\) vs \(2, 2\)$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_tpl_constructor_error_invalid_scalar_type(test, device):
@wp.kernel
def kernel():
wp.mat22(1, 2, 3, 4)
with test.assertRaisesRegex(
RuntimeError,
r"Wrong scalar type for mat 2,2,<class 'warp.types.float32'> constructor$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_tpl_constructor_error_invalid_vector_count(test, device):
@wp.kernel
def kernel():
wp.mat22(wp.vec3(1.0, 2.0, 3.0))
with test.assertRaisesRegex(
RuntimeError,
r"Wrong number of vectors when attempting to construct a matrix " r"with column vectors$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_tpl_constructor_error_invalid_vector_shape(test, device):
@wp.kernel
def kernel():
wp.mat22(wp.vec3(1.0, 2.0, 3.0), wp.vec3(4.0, 5.0, 6.0))
with test.assertRaisesRegex(
RuntimeError,
r"Wrong vector row count when attempting to construct a matrix " r"with column vectors$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_tpl_constructor_error_invalid_arg_count(test, device):
@wp.kernel
def kernel():
wp.mat22(1.0, 2.0, 3.0)
with test.assertRaisesRegex(
RuntimeError,
r"Wrong number of scalars when attempting to construct a matrix " r"from a list of components$",
):
wp.launch(
kernel,
dim=1,
inputs=[],
device=device,
)
def test_tpl_ops_with_anon(test, device):
mat22f = wp.mat((2, 2), dtype=float)
m = wp.mat22f(1.0, 2.0, 3.0, 4.0)
m += mat22f(2.0, 3.0, 4.0, 5.0)
m -= mat22f(3.0, 4.0, 5.0, 6.0)
test.assertSequenceEqual(m, ((0.0, 1.0), (2.0, 3.0)))
m = mat22f(1.0, 2.0, 3.0, 4.0)
m += wp.mat22f(2.0, 3.0, 4.0, 5.0)
m -= wp.mat22f(3.0, 4.0, 5.0, 6.0)
test.assertSequenceEqual(m, ((0.0, 1.0), (2.0, 3.0)))
def test_py_arithmetic_ops(test, device, dtype):
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
def make_mat(*args):
if wptype in wp.types.int_types:
# Cast to the correct integer type to simulate wrapping.
return tuple(tuple(wptype._type_(x).value for x in row) for row in args)
return args
def make_vec(*args):
if wptype in wp.types.int_types:
# Cast to the correct integer type to simulate wrapping.
return tuple(wptype._type_(x).value for x in args)
return args
mat_cls = wp.mat((3, 3), wptype)
vec_cls = wp.vec(3, wptype)
m = mat_cls(((-1, 2, 3), (4, -5, 6), (7, 8, -9)))
test.assertSequenceEqual(+m, make_mat((-1, 2, 3), (4, -5, 6), (7, 8, -9)))
test.assertSequenceEqual(-m, make_mat((1, -2, -3), (-4, 5, -6), (-7, -8, 9)))
test.assertSequenceEqual(m + mat_cls((5, 5, 5) * 3), make_mat((4, 7, 8), (9, 0, 11), (12, 13, -4)))
test.assertSequenceEqual(m - mat_cls((5, 5, 5) * 3), make_mat((-6, -3, -2), (-1, -10, 1), (2, 3, -14)))
test.assertSequenceEqual(m * vec_cls(5, 5, 5), make_vec(20, 25, 30))
test.assertSequenceEqual(m @ vec_cls(5, 5, 5), make_vec(20, 25, 30))
test.assertSequenceEqual(vec_cls(5, 5, 5) * m, make_vec(50, 25, 0))
test.assertSequenceEqual(vec_cls(5, 5, 5) @ m, make_vec(50, 25, 0))
m = mat_cls(((2, 4, 6), (8, 10, 12), (14, 16, 18)))
test.assertSequenceEqual(m * wptype(2), make_mat((4, 8, 12), (16, 20, 24), (28, 32, 36)))
test.assertSequenceEqual(wptype(2) * m, make_mat((4, 8, 12), (16, 20, 24), (28, 32, 36)))
test.assertSequenceEqual(m / wptype(2), make_mat((1, 2, 3), (4, 5, 6), (7, 8, 9)))
test.assertSequenceEqual(wptype(5040) / m, make_mat((2520, 1260, 840), (630, 504, 420), (360, 315, 280)))
test.assertSequenceEqual(m * vec_cls(5, 5, 5), make_vec(60, 150, 240))
test.assertSequenceEqual(m @ vec_cls(5, 5, 5), make_vec(60, 150, 240))
test.assertSequenceEqual(vec_cls(5, 5, 5) * m, make_vec(120, 150, 180))
test.assertSequenceEqual(vec_cls(5, 5, 5) @ m, make_vec(120, 150, 180))
def test_quat_constructor(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 1.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
vec4 = wp.types.vector(length=4, dtype=wptype)
vec3 = wp.types.vector(length=3, dtype=wptype)
quat = wp.types.quaternion(dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_quat_constructor(
p: wp.array(dtype=vec3),
r: wp.array(dtype=quat),
s: wp.array(dtype=vec3),
outcomponents: wp.array(dtype=wptype),
outcomponents_alt: wp.array(dtype=wptype),
):
m = mat44(p[0], r[0], s[0])
R = wp.transpose(wp.quat_to_matrix(r[0]))
c0 = s[0][0] * R[0]
c1 = s[0][1] * R[1]
c2 = s[0][2] * R[2]
m_alt = mat44(
vec4(c0[0], c0[1], c0[2], wptype(0.0)),
vec4(c1[0], c1[1], c1[2], wptype(0.0)),
vec4(c2[0], c2[1], c2[2], wptype(0.0)),
vec4(p[0][0], p[0][1], p[0][2], wptype(1.0)),
)
idx = 0
for i in range(4):
for j in range(4):
outcomponents[idx] = m[i, j]
outcomponents_alt[idx] = m_alt[i, j]
idx = idx + 1
kernel = getkernel(check_mat_quat_constructor, suffix=dtype.__name__)
if register_kernels:
return
# translation:
p = wp.array(rng.standard_normal(size=(1, 3)).astype(dtype), dtype=vec3, requires_grad=True, device=device)
# generate a normalized quaternion for the rotation:
r = rng.standard_normal(size=(1, 4))
r /= np.linalg.norm(r)
r = wp.array(r.astype(dtype), dtype=quat, requires_grad=True, device=device)
# scale:
s = wp.array(rng.standard_normal(size=(1, 3)).astype(dtype), dtype=vec3, requires_grad=True, device=device)
    # Generate the matrix using the constructor, then build it manually, and
    # make sure the values/gradients match:
outcomponents = wp.zeros(4 * 4, dtype=wptype, requires_grad=True, device=device)
outcomponents_alt = wp.zeros(4 * 4, dtype=wptype, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[p, r, s], outputs=[outcomponents, outcomponents_alt], device=device)
assert_np_equal(outcomponents.numpy(), outcomponents_alt.numpy(), tol=1.0e-6)
idx = 0
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
out_alt = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
for i in range(4):
for j in range(4):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[p, r, s], outputs=[outcomponents, outcomponents_alt], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
wp.launch(
output_select_kernel, dim=1, inputs=[outcomponents_alt, idx], outputs=[out_alt], device=device
)
tape.backward(loss=out)
p_grad = 1.0 * tape.gradients[p].numpy()[0]
r_grad = 1.0 * tape.gradients[r].numpy()[0]
s_grad = 1.0 * tape.gradients[s].numpy()[0]
tape.zero()
tape.backward(loss=out_alt)
p_grad_alt = 1.0 * tape.gradients[p].numpy()[0]
r_grad_alt = 1.0 * tape.gradients[r].numpy()[0]
s_grad_alt = 1.0 * tape.gradients[s].numpy()[0]
tape.zero()
assert_np_equal(p_grad, p_grad_alt, tol=tol)
assert_np_equal(r_grad, r_grad_alt, tol=tol)
assert_np_equal(s_grad, s_grad_alt, tol=tol)
idx = idx + 1
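# The kernel above builds the same TRS matrix twice: once via the
# mat44(p, q, s) constructor and once column by column (scaled rotation axes,
# then a translation column with homogeneous 1). A NumPy sketch of that
# construction, assuming Warp's (x, y, z, w) quaternion layout (illustrative
# only, not used by the tests):
def _np_trs_matrix(p, q, s):
    x, y, z, w = q
    r = np.array(
        [
            [1.0 - 2.0 * (y * y + z * z), 2.0 * (x * y - z * w), 2.0 * (x * z + y * w)],
            [2.0 * (x * y + z * w), 1.0 - 2.0 * (x * x + z * z), 2.0 * (y * z - x * w)],
            [2.0 * (x * z - y * w), 2.0 * (y * z + x * w), 1.0 - 2.0 * (x * x + y * y)],
        ]
    )
    m = np.eye(4)
    m[:3, :3] = r * s  # broadcasting scales column j by s[j]
    m[:3, 3] = p
    return m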
def test_negation(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 1.0e-2,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
mat55 = wp.types.matrix(shape=(5, 5), dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_negation(
m2: wp.array(dtype=mat22),
m3: wp.array(dtype=mat33),
m4: wp.array(dtype=mat44),
m5: wp.array(dtype=mat55),
outcomponents: wp.array(dtype=wptype),
):
mat2 = -m2[0]
mat3 = -m3[0]
mat4 = -m4[0]
mat5 = -m5[0]
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(2):
for j in range(2):
outcomponents[idx] = wptype(2) * mat2[i, j]
idx = idx + 1
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * mat3[i, j]
idx = idx + 1
for i in range(4):
for j in range(4):
outcomponents[idx] = wptype(2) * mat4[i, j]
idx = idx + 1
for i in range(5):
for j in range(5):
outcomponents[idx] = wptype(2) * mat5[i, j]
idx = idx + 1
kernel = getkernel(check_mat_negation, suffix=dtype.__name__)
if register_kernels:
return
m2 = wp.array(randvals(rng, [1, 2, 2], dtype), dtype=mat22, requires_grad=True, device=device)
m3 = wp.array(randvals(rng, [1, 3, 3], dtype), dtype=mat33, requires_grad=True, device=device)
m4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
m5 = wp.array(randvals(rng, [1, 5, 5], dtype), dtype=mat55, requires_grad=True, device=device)
outcomponents = wp.zeros(2 * 2 + 3 * 3 + 4 * 4 + 5 * 5, dtype=wptype, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[m2, m3, m4, m5], outputs=[outcomponents], device=device)
assert_np_equal(outcomponents.numpy()[:4], -2 * m2.numpy().reshape(-1), tol=tol)
assert_np_equal(outcomponents.numpy()[4:13], -2 * m3.numpy().reshape(-1), tol=tol)
assert_np_equal(outcomponents.numpy()[13:29], -2 * m4.numpy().reshape(-1), tol=tol)
assert_np_equal(outcomponents.numpy()[29:54], -2 * m5.numpy().reshape(-1), tol=tol)
if dtype in np_float_types:
idx = 0
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
for dim, input in [(2, m2), (3, m3), (4, m4), (5, m5)]:
for i in range(dim):
for j in range(dim):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m2, m3, m4, m5], outputs=[outcomponents], device=device)
wp.launch(
output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device
)
tape.backward(loss=out)
expectedresult = np.zeros((dim, dim), dtype=dtype)
expectedresult[i, j] = -2
assert_np_equal(tape.gradients[input].numpy()[0], expectedresult)
tape.zero()
idx = idx + 1
def test_subtraction(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 5.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
mat55 = wp.types.matrix(shape=(5, 5), dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_sub(
s2: wp.array(dtype=mat22),
s3: wp.array(dtype=mat33),
s4: wp.array(dtype=mat44),
s5: wp.array(dtype=mat55),
v2: wp.array(dtype=mat22),
v3: wp.array(dtype=mat33),
v4: wp.array(dtype=mat44),
v5: wp.array(dtype=mat55),
outcomponents: wp.array(dtype=wptype),
):
v2result = v2[0] - s2[0]
v3result = v3[0] - s3[0]
v4result = v4[0] - s4[0]
v5result = v5[0] - s5[0]
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(2):
for j in range(2):
outcomponents[idx] = wptype(2) * v2result[i, j]
idx = idx + 1
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * v3result[i, j]
idx = idx + 1
for i in range(4):
for j in range(4):
outcomponents[idx] = wptype(2) * v4result[i, j]
idx = idx + 1
for i in range(5):
for j in range(5):
outcomponents[idx] = wptype(2) * v5result[i, j]
idx = idx + 1
kernel = getkernel(check_mat_sub, suffix=dtype.__name__)
if register_kernels:
return
s2 = wp.array(randvals(rng, [1, 2, 2], dtype), dtype=mat22, requires_grad=True, device=device)
s3 = wp.array(randvals(rng, [1, 3, 3], dtype), dtype=mat33, requires_grad=True, device=device)
s4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
s5 = wp.array(randvals(rng, [1, 5, 5], dtype), dtype=mat55, requires_grad=True, device=device)
v2 = wp.array(randvals(rng, [1, 2, 2], dtype), dtype=mat22, requires_grad=True, device=device)
v3 = wp.array(randvals(rng, [1, 3, 3], dtype), dtype=mat33, requires_grad=True, device=device)
v4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
v5 = wp.array(randvals(rng, [1, 5, 5], dtype), dtype=mat55, requires_grad=True, device=device)
outcomponents = wp.zeros(2 * 2 + 3 * 3 + 4 * 4 + 5 * 5, dtype=wptype, requires_grad=True, device=device)
wp.launch(
kernel,
dim=1,
inputs=[
s2,
s3,
s4,
s5,
v2,
v3,
v4,
v5,
],
outputs=[outcomponents],
device=device,
)
assert_np_equal(outcomponents.numpy()[:4], 2 * (v2.numpy() - s2.numpy()).reshape(-1), tol=tol)
assert_np_equal(outcomponents.numpy()[4:13], 2 * (v3.numpy() - s3.numpy()).reshape(-1), tol=tol)
assert_np_equal(outcomponents.numpy()[13:29], 2 * (v4.numpy() - s4.numpy()).reshape(-1), tol=tol)
assert_np_equal(outcomponents.numpy()[29:54], 2 * (v5.numpy() - s5.numpy()).reshape(-1), tol=10 * tol)
if dtype in np_float_types:
idx = 0
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
for dim, in1, in2 in [(2, s2, v2), (3, s3, v3), (4, s4, v4), (5, s5, v5)]:
for i in range(dim):
for j in range(dim):
tape = wp.Tape()
with tape:
wp.launch(
kernel,
dim=1,
inputs=[
s2,
s3,
s4,
s5,
v2,
v3,
v4,
v5,
],
outputs=[outcomponents],
device=device,
)
wp.launch(
output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device
)
tape.backward(loss=out)
expectedresult = np.zeros((dim, dim), dtype=dtype)
expectedresult[i, j] = 2
assert_np_equal(tape.gradients[in2].numpy()[0], expectedresult, tol=10 * tol)
expectedresult[i, j] = -2
assert_np_equal(tape.gradients[in1].numpy()[0], expectedresult, tol=10 * tol)
tape.zero()
idx = idx + 1
def test_determinant(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 5.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
def check_mat_det(
v2: wp.array(dtype=mat22),
v3: wp.array(dtype=mat33),
v4: wp.array(dtype=mat44),
det2: wp.array(dtype=wptype),
det3: wp.array(dtype=wptype),
det4: wp.array(dtype=wptype),
):
# multiply outputs by 2 so we've got something to backpropagate:
det2[0] = wptype(2) * wp.determinant(v2[0])
det3[0] = wptype(2) * wp.determinant(v3[0])
det4[0] = wptype(2) * wp.determinant(v4[0])
kernel = getkernel(check_mat_det, suffix=dtype.__name__)
if register_kernels:
return
v2 = wp.array(randvals(rng, [1, 2, 2], dtype), dtype=mat22, requires_grad=True, device=device)
v3 = wp.array(randvals(rng, [1, 3, 3], dtype), dtype=mat33, requires_grad=True, device=device)
v4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
det2 = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
det3 = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
det4 = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
tape = wp.Tape()
with tape:
wp.launch(
kernel,
dim=1,
inputs=[
v2,
v3,
v4,
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
if dtype in np_float_types:
assert_np_equal(det2.numpy()[0], 2 * np.linalg.det(v2.numpy()[0].astype(np.float64)), tol=100 * tol)
assert_np_equal(det3.numpy()[0], 2 * np.linalg.det(v3.numpy()[0].astype(np.float64)), tol=100 * tol)
assert_np_equal(det4.numpy()[0], 2 * np.linalg.det(v4.numpy()[0].astype(np.float64)), tol=420 * tol)
else:
assert_np_equal(det2.numpy()[0], 2 * np.around(np.linalg.det(v2.numpy()[0])).astype(int))
assert_np_equal(det3.numpy()[0], 2 * np.around(np.linalg.det(v3.numpy()[0])).astype(int))
assert_np_equal(det4.numpy()[0], 2 * np.around(np.linalg.det(v4.numpy()[0])).astype(int))
if dtype in np_float_types:
        # the analytic determinant derivative is awkward to write out, so
        # check against central finite differences instead:
tape.backward(loss=det2)
v2grads = 1.0 * tape.gradients[v2].numpy()[0]
tape.zero()
tape.backward(loss=det3)
v3grads = 1.0 * tape.gradients[v3].numpy()[0]
tape.zero()
tape.backward(loss=det4)
v4grads = 1.0 * tape.gradients[v4].numpy()[0]
tape.zero()
        # finite differences are noisy, hence the large tolerances; float16
        # is especially inaccurate:
dx = 0.01 if dtype == np.float16 else 0.0001
fdtol = 2.0e-1 if dtype == np.float16 else 2.0e-3
for i in range(2):
for j in range(2):
v2test = v2.numpy()
v2test[0, i, j] += dx
wp.launch(
kernel,
dim=1,
inputs=[
wp.array(v2test, dtype=v2.dtype, requires_grad=True, device=device),
v3,
v4,
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
dplus = det2.numpy()[0]
v2test[0, i, j] -= 2.0 * dx
wp.launch(
kernel,
dim=1,
inputs=[
wp.array(v2test, dtype=v2.dtype, requires_grad=True, device=device),
v3,
v4,
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
dminus = det2.numpy()[0]
assert_np_equal((dplus - dminus) / (2.0 * dx * dplus), v2grads[i, j] / dplus, tol=fdtol)
for i in range(3):
for j in range(3):
v3test = v3.numpy()
v3test[0, i, j] += dx
wp.launch(
kernel,
dim=1,
inputs=[
v2,
wp.array(v3test, dtype=v3.dtype, requires_grad=True, device=device),
v4,
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
dplus = det3.numpy()[0]
v3test[0, i, j] -= 2.0 * dx
wp.launch(
kernel,
dim=1,
inputs=[
v2,
wp.array(v3test, dtype=v3.dtype, requires_grad=True, device=device),
v4,
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
dminus = det3.numpy()[0]
assert_np_equal((dplus - dminus) / (2.0 * dx * dplus), v3grads[i, j] / dplus, tol=fdtol)
for i in range(4):
for j in range(4):
v4test = v4.numpy()
v4test[0, i, j] += dx
wp.launch(
kernel,
dim=1,
inputs=[
v2,
v3,
wp.array(v4test, dtype=v4.dtype, requires_grad=True, device=device),
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
dplus = det4.numpy()[0]
v4test[0, i, j] -= 2.0 * dx
wp.launch(
kernel,
dim=1,
inputs=[
v2,
v3,
wp.array(v4test, dtype=v4.dtype, requires_grad=True, device=device),
],
outputs=[
det2,
det3,
det4,
],
device=device,
)
dminus = det4.numpy()[0]
assert_np_equal((dplus - dminus) / (2.0 * dx * dplus), v4grads[i, j] / dplus, tol=fdtol)
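# The gradient checks above use central finite differences. A NumPy-only
# sketch of the same scheme (the helper name is illustrative):
def _central_difference(f, x, i, j, dx=1.0e-4):
    xp = x.copy()
    xp[i, j] += dx
    xm = x.copy()
    xm[i, j] -= dx
    # second-order accurate approximation of df / dx[i, j]
    return (f(xp) - f(xm)) / (2.0 * dx)
# e.g. _central_difference(np.linalg.det, m, 0, 1) approximates the same
# quantity the tape computes for det2/det3/det4 (up to the factor of 2).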
# Unused. Why?
# def test_get_diag(test, device, dtype, register_kernels=False):
#     rng = np.random.default_rng(123)
#     tol = {
#         np.float16: 1.0e-3,
#         np.float32: 1.0e-6,
#         np.float64: 1.0e-8,
#     }.get(dtype, 0)
#
#     wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
#     mat55 = wp.types.matrix(shape=(5, 5), dtype=wptype)
#
#     output_select_kernel = get_select_kernel(wptype)
#
#     def check_mat_diag(
#         m55: wp.array(dtype=mat55),
#         outcomponents: wp.array(dtype=wptype),
#     ):
#         # multiply outputs by 2 so we've got something to backpropagate:
#         vec5result = wptype(2) * wp.get_diag(m55[0])
#
#         idx = 0
#         for i in range(5):
#             outcomponents[idx] = vec5result[i]
#             idx = idx + 1
#
#     kernel = getkernel(check_mat_diag, suffix=dtype.__name__)
#
#     if register_kernels:
#         return
#
#     m55 = wp.array(randvals(rng, [1, 5, 5], dtype), dtype=mat55, requires_grad=True, device=device)
#     outcomponents = wp.zeros(5, dtype=wptype, requires_grad=True, device=device)
#     out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
#
#     wp.launch(kernel, dim=1, inputs=[m55], outputs=[outcomponents], device=device)
#
#     assert_np_equal(outcomponents.numpy(), 2 * np.diag(m55.numpy()[0]), tol=tol)
#
#     if dtype in np_float_types:
#         idx = 0
#         for i in range(5):
#             tape = wp.Tape()
#             with tape:
#                 wp.launch(kernel, dim=1, inputs=[m55], outputs=[outcomponents], device=device)
#                 wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
#             tape.backward(loss=out)
#             expectedresult = np.zeros((5, 5), dtype=dtype)
#             expectedresult[i, i] = 2
#             assert_np_equal(tape.gradients[m55].numpy()[0], expectedresult, tol=10 * tol)
#             tape.zero()
#
#             idx = idx + 1
def test_inverse(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 5.0e-2,
np.float32: 1.0e-5,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_inverse(
m2: wp.array(dtype=mat22),
m3: wp.array(dtype=mat33),
m4: wp.array(dtype=mat44),
outcomponents: wp.array(dtype=wptype),
):
m2result = wp.inverse(m2[0])
m3result = wp.inverse(m3[0])
m4result = wp.inverse(m4[0])
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(2):
for j in range(2):
outcomponents[idx] = wptype(2) * m2result[i, j]
idx = idx + 1
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * m3result[i, j]
idx = idx + 1
for i in range(4):
for j in range(4):
outcomponents[idx] = wptype(2) * m4result[i, j]
idx = idx + 1
kernel = getkernel(check_mat_inverse, suffix=dtype.__name__)
if register_kernels:
return
m2 = wp.array(
2 * (randvals(rng, [1, 2, 2], dtype) + 0.2 * np.eye(2)), dtype=mat22, requires_grad=True, device=device
)
m3 = wp.array(
2 * (randvals(rng, [1, 3, 3], dtype) + 0.2 * np.eye(3)), dtype=mat33, requires_grad=True, device=device
)
m4 = wp.array(
2 * (randvals(rng, [1, 4, 4], dtype) + 0.2 * np.eye(4)), dtype=mat44, requires_grad=True, device=device
)
outcomponents = wp.zeros(2 * 2 + 3 * 3 + 4 * 4, dtype=wptype, requires_grad=True, device=device)
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
assert_np_equal(outcomponents.numpy()[:4], 2 * np.linalg.inv(m2.numpy()[0].astype(np.float64)), tol=tol)
assert_np_equal(outcomponents.numpy()[4:13], 2 * np.linalg.inv(m3.numpy()[0].astype(np.float64)), tol=5 * tol)
assert_np_equal(outcomponents.numpy()[13:], 2 * np.linalg.inv(m4.numpy()[0].astype(np.float64)), tol=5 * tol)
if dtype in np_float_types:
# check gradients:
idx = 0
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
for dim, input in [(2, m2), (3, m3), (4, m4)]:
minv = np.linalg.inv(input.numpy()[0].astype(np.float64))
for i in range(dim):
for j in range(dim):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
wp.launch(
output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device
)
tape.backward(loss=out)
d = np.zeros((dim, dim))
d[j, i] = 2
assert_np_equal(
tape.gradients[input].numpy()[0], -np.matmul(minv, np.matmul(d, minv)).T, tol=10 * tol
)
tape.zero()
idx = idx + 1
    # cross-check the 2x2 result against the closed-form adjugate formula:
m = m2.numpy()[0]
det = m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]
expected = 2 * np.array([[m[1, 1], -m[0, 1]], [-m[1, 0], m[0, 0]]], dtype=dtype) / det
assert_np_equal(expected, outcomponents.numpy()[:4], tol=tol)
# 0,0 component is this:
# 2 * m[1,1] / (m[0,0]*m[1,1] - m[1,0] * m[0,1])
assert_np_equal(2 * m[1, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]), outcomponents.numpy()[0], tol=tol)
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, 0], outputs=[out], device=device)
if dtype in np_float_types:
tape.backward(loss=out)
g = tape.gradients[m2].numpy()[0]
assert_np_equal(-2 * m[1, 1] * m[1, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 0], tol=tol)
assert_np_equal(2 * m[1, 1] * m[0, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 0], tol=tol)
assert_np_equal(-2 * m[0, 1] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 1], tol=tol)
assert_np_equal(2 * m[1, 1] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 1], tol=tol)
tape.zero()
# 0,1 component is this:
# -2 * m[0,1] / (m[0,0]*m[1,1] - m[1,0] * m[0,1])
assert_np_equal(-2 * m[0, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]), outcomponents.numpy()[1], tol=tol)
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, 1], outputs=[out], device=device)
if dtype in np_float_types:
tape.backward(loss=out)
g = tape.gradients[m2].numpy()[0]
assert_np_equal(2 * m[0, 1] * m[1, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 0], tol=tol)
assert_np_equal(-2 * m[0, 1] * m[0, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 0], tol=tol)
assert_np_equal(2 * m[0, 0] * m[0, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 1], tol=tol)
assert_np_equal(-2 * m[1, 1] * m[0, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 1], tol=tol)
tape.zero()
# 1,0 component is this:
# -2 * m[1,0] / (m[0,0]*m[1,1] - m[1,0] * m[0,1])
assert_np_equal(-2 * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]), outcomponents.numpy()[2], tol=tol)
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, 2], outputs=[out], device=device)
if dtype in np_float_types:
tape.backward(loss=out)
g = tape.gradients[m2].numpy()[0]
assert_np_equal(2 * m[1, 1] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 0], tol=tol)
assert_np_equal(-2 * m[0, 0] * m[1, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 0], tol=tol)
assert_np_equal(2 * m[0, 0] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 1], tol=tol)
assert_np_equal(-2 * m[1, 0] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 1], tol=tol)
tape.zero()
# 1,1 component is this:
# 2 * m[0,0] / (m[0,0]*m[1,1] - m[1,0] * m[0,1])
assert_np_equal(2 * m[0, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]), outcomponents.numpy()[3], tol=tol)
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m2, m3, m4], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, 3], outputs=[out], device=device)
if dtype in np_float_types:
tape.backward(loss=out)
g = tape.gradients[m2].numpy()[0]
assert_np_equal(-2 * m[0, 1] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 0], tol=tol)
assert_np_equal(2 * m[0, 0] * m[0, 1] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 0], tol=tol)
assert_np_equal(2 * m[0, 0] * m[1, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[0, 1], tol=tol)
assert_np_equal(-2 * m[0, 0] * m[0, 0] / (m[0, 0] * m[1, 1] - m[1, 0] * m[0, 1]) ** 2, g[1, 1], tol=tol)
tape.zero()
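# The inverse gradient check above uses the identity
# d(A^-1) = -A^-1 @ dA @ A^-1. A NumPy sketch of the derivative of the
# inverse with respect to a single entry of A (illustrative names):
def _inverse_entry_derivative(minv, i, j):
    d = np.zeros_like(minv)
    d[i, j] = 1.0
    return -minv @ d @ minv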
def test_svd(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 1.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-6,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
vec3 = wp.types.vector(length=3, dtype=wptype)
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
def check_mat_svd(
m3: wp.array(dtype=mat33),
Uout: wp.array(dtype=mat33),
sigmaout: wp.array(dtype=vec3),
Vout: wp.array(dtype=mat33),
outcomponents: wp.array(dtype=wptype),
):
U = mat33()
sigma = vec3()
V = mat33()
wp.svd3(m3[0], U, sigma, V)
Uout[0] = U
sigmaout[0] = sigma
Vout[0] = V
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * U[i, j]
idx = idx + 1
for i in range(3):
outcomponents[idx] = wptype(2) * sigma[i]
idx = idx + 1
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * V[i, j]
idx = idx + 1
kernel = getkernel(check_mat_svd, suffix=dtype.__name__)
output_select_kernel = get_select_kernel(wptype)
if register_kernels:
return
m3 = wp.array(randvals(rng, [1, 3, 3], dtype) + np.eye(3), dtype=mat33, requires_grad=True, device=device)
outcomponents = wp.zeros(2 * 3 * 3 + 3, dtype=wptype, requires_grad=True, device=device)
Uout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
sigmaout = wp.zeros(1, dtype=vec3, requires_grad=True, device=device)
Vout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[m3], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
Uout_np = Uout.numpy()[0].astype(np.float64)
sigmaout_np = np.diag(sigmaout.numpy()[0].astype(np.float64))
Vout_np = Vout.numpy()[0].astype(np.float64)
assert_np_equal(
np.matmul(Uout_np, np.matmul(sigmaout_np, Vout_np.T)), m3.numpy()[0].astype(np.float64), tol=30 * tol
)
if dtype == np.float16:
        # skip the gradient checks for float16; the rounding errors make
        # them unreliable.
return
# check gradients:
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
idx = 0
for idx in range(3 * 3 + 3 + 3 * 3):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m3], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
tape.backward(out)
m3grads = 1.0 * tape.gradients[m3].numpy()[0]
tape.zero()
dx = 0.0001
fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2
for ii in range(3):
for jj in range(3):
m3test = 1.0 * m3.numpy()
m3test[0, ii, jj] += dx
wp.launch(
kernel,
dim=1,
inputs=[wp.array(m3test, dtype=mat33, device=device)],
outputs=[Uout, sigmaout, Vout, outcomponents],
device=device,
)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
plusval = out.numpy()[0]
m3test = 1.0 * m3.numpy()
m3test[0, ii, jj] -= dx
wp.launch(
kernel,
dim=1,
inputs=[wp.array(m3test, dtype=mat33, device=device)],
outputs=[Uout, sigmaout, Vout, outcomponents],
device=device,
)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
minusval = out.numpy()[0]
assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)
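# test_svd validates the factorization by reconstruction: U @ diag(sigma) @ V.T
# must reproduce the input. The NumPy analogue (a sketch; note np.linalg.svd
# returns V already transposed, unlike wp.svd3):
def _np_svd_reconstruct(a):
    u, s, vt = np.linalg.svd(a)
    return u @ np.diag(s) @ vt  # equals a up to round-off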
def test_qr(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 2.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-6,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
def check_mat_qr(
m3: wp.array(dtype=mat33),
Qout: wp.array(dtype=mat33),
Rout: wp.array(dtype=mat33),
outcomponents: wp.array(dtype=wptype),
):
Q = mat33()
R = mat33()
wp.qr3(m3[0], Q, R)
Qout[0] = Q
Rout[0] = R
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * Q[i, j]
idx = idx + 1
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * R[i, j]
idx = idx + 1
kernel = getkernel(check_mat_qr, suffix=dtype.__name__)
output_select_kernel = get_select_kernel(wptype)
if register_kernels:
return
m3 = wp.array(0.5 * (randvals(rng, [1, 3, 3], dtype) + np.eye(3)), dtype=mat33, requires_grad=True, device=device)
outcomponents = wp.zeros(2 * 3 * 3, dtype=wptype, requires_grad=True, device=device)
Qout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
Rout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, Rout, outcomponents], device=device)
Qout_np = Qout.numpy()[0].astype(np.float64)
Rout_np = Rout.numpy()[0].astype(np.float64)
    # check Q is orthogonal and R is upper triangular:
assert_np_equal(np.matmul(Qout_np.T, Qout_np), np.eye(3, dtype=np.float64), tol=tol)
assert_np_equal(Rout_np[1, [0]], np.zeros(1, dtype=np.float64), tol=tol)
assert_np_equal(Rout_np[2, [0, 1]], np.zeros(2, dtype=np.float64), tol=tol)
    # check that Q @ R reconstructs the input:
assert_np_equal(np.matmul(Qout_np, Rout_np), m3.numpy()[0].astype(np.float64), tol=30 * tol)
if dtype == np.float16:
        # skip the gradient checks for float16; the rounding errors make
        # them unreliable.
return
# check gradients:
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
idx = 0
for idx in range(len(outcomponents)):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, Rout, outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
tape.backward(out)
m3grads = 1.0 * tape.gradients[m3].numpy()[0]
tape.zero()
dx = 0.0001
fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2
for ii in range(3):
for jj in range(3):
m3test = 1.0 * m3.numpy()
m3test[0, ii, jj] += dx
wp.launch(
kernel,
dim=1,
inputs=[wp.array(m3test, dtype=mat33, device=device)],
outputs=[Qout, Rout, outcomponents],
device=device,
)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
plusval = out.numpy()[0]
m3test = 1.0 * m3.numpy()
m3test[0, ii, jj] -= dx
wp.launch(
kernel,
dim=1,
inputs=[wp.array(m3test, dtype=mat33, device=device)],
outputs=[Qout, Rout, outcomponents],
device=device,
)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
minusval = out.numpy()[0]
assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)
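# test_qr checks the defining properties separately: Q orthogonal, R upper
# triangular, and Q @ R reconstructing the input. The same checks against
# NumPy's factorization (a sketch, not used by the tests):
def _np_qr_checks(a):
    q, r = np.linalg.qr(a)
    assert np.allclose(q.T @ q, np.eye(a.shape[0]))
    assert np.allclose(np.tril(r, -1), 0.0)  # strictly-lower part is zero
    assert np.allclose(q @ r, a)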
def test_eig(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 4.0e-2,
np.float32: 1.0e-5,
np.float64: 1.0e-5,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
vec3 = wp.types.vector(length=3, dtype=wptype)
mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
def check_mat_eig(
m3: wp.array(dtype=mat33),
Qout: wp.array(dtype=mat33),
dout: wp.array(dtype=vec3),
outcomponents: wp.array(dtype=wptype),
):
Q = mat33()
d = vec3()
wp.eig3(m3[0] + wp.transpose(m3[0]), Q, d)
Qout[0] = Q
dout[0] = d
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * Q[i, j]
idx = idx + 1
for i in range(3):
outcomponents[idx] = wptype(2) * d[i]
idx = idx + 1
kernel = getkernel(check_mat_eig, suffix=dtype.__name__)
output_select_kernel = get_select_kernel(wptype)
if register_kernels:
return
m3_np = randvals(rng, [1, 3, 3], dtype) + np.eye(3, dtype=dtype)
m3 = wp.array(m3_np, dtype=mat33, requires_grad=True, device=device)
outcomponents = wp.zeros(3 * 3 + 3, dtype=wptype, requires_grad=True, device=device)
Qout = wp.zeros(1, dtype=mat33, requires_grad=True, device=device)
dout = wp.zeros(1, dtype=vec3, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, dout, outcomponents], device=device)
Qout_np = Qout.numpy()[0].astype(np.float64)
dout_np = dout.numpy()[0].astype(np.float64)
Dout_np = np.diag(dout_np)
# check Q is orthogonal:
assert_np_equal(np.matmul(Qout_np.T, Qout_np), np.eye(3), tol=tol)
# check Q contains eigenvectors:
assert_np_equal(np.matmul(Qout_np, np.matmul(Dout_np, Qout_np.T)), (m3_np[0] + m3_np[0].transpose()), tol=tol)
if dtype == np.float16:
        # skip the gradient checks for float16; the rounding errors make
        # them unreliable.
return
# check gradients:
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
idx = 0
for idx in range(len(outcomponents)):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[m3], outputs=[Qout, dout, outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
tape.backward(out)
m3grads = 1.0 * tape.gradients[m3].numpy()[0]
tape.zero()
dx = 0.0001
fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2
for ii in range(3):
for jj in range(3):
m3test = 1.0 * m3.numpy()
m3test[0, ii, jj] += dx
wp.launch(
kernel,
dim=1,
inputs=[wp.array(m3test, dtype=mat33, device=device)],
outputs=[Qout, dout, outcomponents],
device=device,
)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
plusval = out.numpy()[0]
m3test = 1.0 * m3.numpy()
m3test[0, ii, jj] -= dx
wp.launch(
kernel,
dim=1,
inputs=[wp.array(m3test, dtype=mat33, device=device)],
outputs=[Qout, dout, outcomponents],
device=device,
)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
minusval = out.numpy()[0]
assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)
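# test_eig symmetrizes its input (m + m^T) and checks that Q @ diag(d) @ Q.T
# reconstructs it. The NumPy analogue for symmetric matrices (a sketch):
def _np_eigh_reconstruct(a_sym):
    d, q = np.linalg.eigh(a_sym)
    return q @ np.diag(d) @ q.T  # equals a_sym up to round-off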
def test_skew(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 1.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
vec3 = wp.types.vector(length=3, dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_skew(
v3: wp.array(dtype=vec3),
outcomponents: wp.array(dtype=wptype),
):
m3result = wp.skew(v3[0])
# multiply outputs by 2 so we've got something to backpropagate:
idx = 0
for i in range(3):
for j in range(3):
outcomponents[idx] = wptype(2) * m3result[i, j]
idx = idx + 1
kernel = getkernel(check_mat_skew, suffix=dtype.__name__)
if register_kernels:
return
v3 = wp.array(randvals(rng, [1, 3], dtype), dtype=vec3, requires_grad=True, device=device)
outcomponents = wp.zeros(3 * 3, dtype=wptype, requires_grad=True, device=device)
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[v3], outputs=[outcomponents], device=device)
    # make sure skew() produces a cross-product matrix:
crossprodmat = outcomponents.numpy().reshape(3, 3)
assert_np_equal(
np.matmul(crossprodmat, np.array([1, 0, 0])).reshape(-1),
2 * np.cross(v3.numpy()[0], np.array([1, 0, 0])),
tol=tol,
)
assert_np_equal(
np.matmul(crossprodmat, np.array([0, 1, 0])).reshape(-1),
2 * np.cross(v3.numpy()[0], np.array([0, 1, 0])),
tol=tol,
)
assert_np_equal(
np.matmul(crossprodmat, np.array([0, 0, 1])).reshape(-1),
2 * np.cross(v3.numpy()[0], np.array([0, 0, 1])),
tol=tol,
)
# check it another way:
x0 = v3.numpy()[0, 0]
x1 = v3.numpy()[0, 1]
x2 = v3.numpy()[0, 2]
crossprodmat_expected = np.array(
[
[0, -x2, x1],
[x2, 0, -x0],
[-x1, x0, 0],
],
dtype=dtype,
)
assert_np_equal(crossprodmat, 2 * crossprodmat_expected, tol=tol)
if dtype in np_float_types:
idx = 0
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
for i in range(3):
for j in range(3):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[v3], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
tape.backward(loss=out)
if i == j:
assert_np_equal(tape.gradients[v3].numpy()[0], np.zeros(3))
elif [i, j] == [0, 1]:
assert_np_equal(tape.gradients[v3].numpy()[0], np.array([0, 0, -2]))
elif [i, j] == [1, 0]:
assert_np_equal(tape.gradients[v3].numpy()[0], np.array([0, 0, 2]))
elif [i, j] == [0, 2]:
assert_np_equal(tape.gradients[v3].numpy()[0], np.array([0, 2, 0]))
elif [i, j] == [2, 0]:
assert_np_equal(tape.gradients[v3].numpy()[0], np.array([0, -2, 0]))
elif [i, j] == [1, 2]:
assert_np_equal(tape.gradients[v3].numpy()[0], np.array([-2, 0, 0]))
elif [i, j] == [2, 1]:
assert_np_equal(tape.gradients[v3].numpy()[0], np.array([2, 0, 0]))
tape.zero()
idx = idx + 1
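# test_skew relies on the identity skew(v) @ w == cross(v, w). A standalone
# NumPy sketch of the same construction (illustrative, not used by the tests):
def _np_skew(v):
    x0, x1, x2 = v
    return np.array(
        [
            [0.0, -x2, x1],
            [x2, 0.0, -x0],
            [-x1, x0, 0.0],
        ]
    )
# For any 3-vectors v and w: np.allclose(_np_skew(v) @ w, np.cross(v, w)).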
def test_transform_point(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 5.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
vec3 = wp.types.vector(length=3, dtype=wptype)
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_transform_point(
v3: wp.array(dtype=vec3),
m4: wp.array(dtype=mat44),
outcomponents: wp.array(dtype=wptype),
):
# multiply outputs by 2 so we've got something to backpropagate:
presult = wptype(2) * wp.transform_point(m4[0], v3[0])
outcomponents[0] = presult[0]
outcomponents[1] = presult[1]
outcomponents[2] = presult[2]
kernel = getkernel(check_mat_transform_point, suffix=dtype.__name__)
if register_kernels:
return
v3 = wp.array(randvals(rng, [1, 3], dtype), dtype=vec3, requires_grad=True, device=device)
m4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
outcomponents = wp.zeros(3, dtype=wptype, requires_grad=True, device=device)
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)
v3homog = np.ones(4, dtype=dtype)
v3homog[:3] = v3.numpy()[0]
assert_np_equal(outcomponents.numpy(), 2 * np.matmul(m4.numpy()[0], v3homog)[:3], tol=10 * tol)
if dtype in np_float_types:
for j in range(3):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, j], outputs=[out], device=device)
tape.backward(loss=out)
assert_np_equal(2 * m4.numpy()[0, j, :3], tape.gradients[v3].numpy(), tol=tol)
expected = np.zeros((4, 4), dtype=dtype)
expected[j, :3] = 2 * v3.numpy()
expected[j, 3] = 2
assert_np_equal(tape.gradients[m4].numpy(), expected, tol=tol)
tape.zero()
def test_transform_vector(test, device, dtype, register_kernels=False):
rng = np.random.default_rng(123)
tol = {
np.float16: 5.0e-3,
np.float32: 1.0e-6,
np.float64: 1.0e-8,
}.get(dtype, 0)
wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
vec3 = wp.types.vector(length=3, dtype=wptype)
mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
output_select_kernel = get_select_kernel(wptype)
def check_mat_transform_vector(
v3: wp.array(dtype=vec3),
m4: wp.array(dtype=mat44),
outcomponents: wp.array(dtype=wptype),
):
# multiply outputs by 2 so we've got something to backpropagate:
presult = wptype(2) * wp.transform_vector(m4[0], v3[0])
outcomponents[0] = presult[0]
outcomponents[1] = presult[1]
outcomponents[2] = presult[2]
kernel = getkernel(check_mat_transform_vector, suffix=dtype.__name__)
if register_kernels:
return
v3 = wp.array(randvals(rng, [1, 3], dtype), dtype=vec3, requires_grad=True, device=device)
m4 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=True, device=device)
outcomponents = wp.zeros(3, dtype=wptype, requires_grad=True, device=device)
out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)
v3homog = np.zeros(4, dtype=dtype)
v3homog[:3] = v3.numpy()[0]
assert_np_equal(outcomponents.numpy(), 2 * np.matmul(m4.numpy()[0], v3homog)[:3], tol=10 * tol)
if dtype in np_float_types:
for j in range(3):
tape = wp.Tape()
with tape:
wp.launch(kernel, dim=1, inputs=[v3, m4], outputs=[outcomponents], device=device)
wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, j], outputs=[out], device=device)
tape.backward(loss=out)
assert_np_equal(2 * m4.numpy()[0, j, :3], tape.gradients[v3].numpy(), tol=tol)
expected = np.zeros((4, 4), dtype=dtype)
expected[j, :3] = 2 * v3.numpy()
assert_np_equal(tape.gradients[m4].numpy(), expected, tol=tol)
tape.zero()
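# transform_point and transform_vector differ only in the homogeneous
# coordinate: points are extended with w = 1 (translation applies), vectors
# with w = 0 (translation drops out). A NumPy sketch of both (illustrative):
def _np_transform(m4, v3, w):
    return (m4 @ np.append(v3, w))[:3]
# _np_transform(m, v, 1.0) mirrors wp.transform_point(m, v);
# _np_transform(m, v, 0.0) mirrors wp.transform_vector(m, v).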
# Test matrix constructors using explicit type (float16).
# Note that these tests specifically do not use generics / closure args to
# create kernels dynamically (unlike the rest of this file), as those use
# different code paths to resolve arg types, which have led to regressions.
@wp.kernel
def test_constructors_explicit_precision():
# construction for custom matrix types
eye = wp.identity(dtype=wp.float16, n=2)
zeros = wp.matrix(shape=(2, 2), dtype=wp.float16)
custom = wp.matrix(wp.float16(0.0), wp.float16(1.0), wp.float16(2.0), wp.float16(3.0), shape=(2, 2))
for i in range(2):
for j in range(2):
if i == j:
wp.expect_eq(eye[i, j], wp.float16(1.0))
else:
wp.expect_eq(eye[i, j], wp.float16(0.0))
wp.expect_eq(zeros[i, j], wp.float16(0.0))
wp.expect_eq(custom[i, j], wp.float16(i) * wp.float16(2.0) + wp.float16(j))
mat32d = wp.mat(shape=(3, 2), dtype=wp.float64)
@wp.kernel
def test_matrix_constructor_value_func():
a = wp.mat22()
b = wp.matrix(a, shape=(2, 2))
c = mat32d()
d = mat32d(c, shape=(3, 2))
e = mat32d(wp.float64(1.0), wp.float64(2.0), wp.float64(1.0), wp.float64(2.0), wp.float64(1.0), wp.float64(2.0))
f = mat32d(
wp.vec3d(wp.float64(1.0), wp.float64(2.0), wp.float64(3.0)),
wp.vec3d(wp.float64(1.0), wp.float64(2.0), wp.float64(3.0)),
)
# Same as above but with a default (float/int) type, which exercises some
# different code paths that need to ensure types are correctly canonicalized
# during codegen.
@wp.kernel
def test_constructors_default_precision():
# construction for default (float) matrix types
eye = wp.identity(dtype=float, n=2)
zeros = wp.matrix(shape=(2, 2), dtype=float)
custom = wp.matrix(0.0, 1.0, 2.0, 3.0, shape=(2, 2))
for i in range(2):
for j in range(2):
if i == j:
wp.expect_eq(eye[i, j], 1.0)
else:
wp.expect_eq(eye[i, j], 0.0)
wp.expect_eq(zeros[i, j], 0.0)
wp.expect_eq(custom[i, j], float(i) * 2.0 + float(j))
@wp.kernel
def test_matrix_mutation(expected: wp.types.matrix(shape=(10, 3), dtype=float)):
m = wp.matrix(shape=(10, 3), dtype=float)
# test direct element indexing
m[0, 0] = 1.0
m[0, 1] = 2.0
m[0, 2] = 3.0
    # The nested indexing (matrix->vector->scalar) below does not currently
    # modify m because m[0] returns the row vector by value rather than by
    # reference. This differs from NumPy, which always returns by reference.
    # It is unclear how to support reference semantics together with auto-diff.
# m[0][1] = 2.0
# m[0][2] = 3.0
# test setting rows
for i in range(1, 10):
m[i] = m[i - 1] + wp.vec3(1.0, 2.0, 3.0)
wp.expect_eq(m, expected)
CONSTANT_SHAPE_ROWS = wp.constant(10)
CONSTANT_SHAPE_COLS = wp.constant(10)
# Tests that global constants can be used in the shape keyword argument of
# the matrix constructor.
@wp.kernel
def test_constructors_constant_shape():
m = wp.matrix(shape=(CONSTANT_SHAPE_ROWS, CONSTANT_SHAPE_COLS), dtype=float)
for i in range(CONSTANT_SHAPE_ROWS):
for j in range(CONSTANT_SHAPE_COLS):
m[i, j] = float(i * j)
devices = get_test_devices()
class TestMat(unittest.TestCase):
pass
add_kernel_test(TestMat, test_constructors_explicit_precision, dim=1, devices=devices)
add_kernel_test(TestMat, test_constructors_default_precision, dim=1, devices=devices)
add_kernel_test(TestMat, test_constructors_constant_shape, dim=1, devices=devices)
add_kernel_test(TestMat, test_matrix_constructor_value_func, dim=1, devices=devices)
mat103 = wp.types.matrix(shape=(10, 3), dtype=float)
add_kernel_test(
TestMat,
test_matrix_mutation,
dim=1,
inputs=[
        mat103(
            # fmt: off
            1.0, 2.0, 3.0,
            2.0, 4.0, 6.0,
            3.0, 6.0, 9.0,
            4.0, 8.0, 12.0,
            5.0, 10.0, 15.0,
            6.0, 12.0, 18.0,
            7.0, 14.0, 21.0,
            8.0, 16.0, 24.0,
            9.0, 18.0, 27.0,
            10.0, 20.0, 30.0,
            # fmt: on
        )
],
devices=devices,
)
for dtype in np_signed_int_types + np_float_types:
add_function_test_register_kernel(
TestMat, f"test_negation_{dtype.__name__}", test_negation, devices=devices, dtype=dtype
)
add_function_test_register_kernel(
TestMat, f"test_subtraction_{dtype.__name__}", test_subtraction, devices=devices, dtype=dtype
)
add_function_test(
TestMat,
"test_anon_constructor_error_shape_keyword_missing",
test_anon_constructor_error_shape_keyword_missing,
devices=devices,
)
add_function_test(
TestMat,
"test_anon_constructor_error_dtype_keyword_missing",
test_anon_constructor_error_dtype_keyword_missing,
devices=devices,
)
add_function_test(
TestMat,
"test_anon_constructor_error_shape_mismatch",
test_anon_constructor_error_shape_mismatch,
devices=devices,
)
add_function_test(
TestMat,
"test_anon_constructor_error_invalid_arg_count",
test_anon_constructor_error_invalid_arg_count,
devices=devices,
)
add_function_test(
TestMat,
"test_tpl_constructor_error_incompatible_sizes",
test_tpl_constructor_error_incompatible_sizes,
devices=devices,
)
add_function_test(
TestMat,
"test_tpl_constructor_error_invalid_scalar_type",
test_tpl_constructor_error_invalid_scalar_type,
devices=devices,
)
add_function_test(
TestMat,
"test_tpl_constructor_error_invalid_vector_count",
test_tpl_constructor_error_invalid_vector_count,
devices=devices,
)
add_function_test(
TestMat,
"test_tpl_constructor_error_invalid_vector_shape",
test_tpl_constructor_error_invalid_vector_shape,
devices=devices,
)
add_function_test(
TestMat,
"test_tpl_constructor_error_invalid_arg_count",
test_tpl_constructor_error_invalid_arg_count,
devices=devices,
)
add_function_test(TestMat, "test_tpl_ops_with_anon", test_tpl_ops_with_anon)
for dtype in np_float_types:
add_function_test(
TestMat, f"test_py_arithmetic_ops_{dtype.__name__}", test_py_arithmetic_ops, devices=None, dtype=dtype
)
add_function_test_register_kernel(
TestMat, f"test_quat_constructor_{dtype.__name__}", test_quat_constructor, devices=devices, dtype=dtype
)
add_function_test_register_kernel(
TestMat, f"test_inverse_{dtype.__name__}", test_inverse, devices=devices, dtype=dtype
)
add_function_test_register_kernel(TestMat, f"test_svd_{dtype.__name__}", test_svd, devices=devices, dtype=dtype)
add_function_test_register_kernel(TestMat, f"test_qr_{dtype.__name__}", test_qr, devices=devices, dtype=dtype)
add_function_test_register_kernel(TestMat, f"test_eig_{dtype.__name__}", test_eig, devices=devices, dtype=dtype)
add_function_test_register_kernel(
TestMat, f"test_transform_point_{dtype.__name__}", test_transform_point, devices=devices, dtype=dtype
)
add_function_test_register_kernel(
TestMat, f"test_transform_vector_{dtype.__name__}", test_transform_vector, devices=devices, dtype=dtype
)
add_function_test_register_kernel(
TestMat, f"test_determinant_{dtype.__name__}", test_determinant, devices=devices, dtype=dtype
)
add_function_test_register_kernel(TestMat, f"test_skew_{dtype.__name__}", test_skew, devices=devices, dtype=dtype)
if __name__ == "__main__":
wp.build.clear_kernel_cache()
unittest.main(verbosity=2, failfast=True)