# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

from __future__ import annotations

import builtins
import ctypes
import hashlib
import inspect
import struct
import zlib
from typing import Any, Callable, Generic, List, Tuple, TypeVar, Union

import numpy as np

import warp

# type hints
Length = TypeVar("Length", bound=int)
Rows = TypeVar("Rows")
Cols = TypeVar("Cols")
DType = TypeVar("DType")

Int = TypeVar("Int")
Float = TypeVar("Float")
Scalar = TypeVar("Scalar")
Vector = Generic[Length, Scalar]
Matrix = Generic[Rows, Cols, Scalar]
Quaternion = Generic[Float]
Transformation = Generic[Float]

DType = TypeVar("DType")
Array = Generic[DType]

T = TypeVar("T")

# shared hash for all constants
_constant_hash = hashlib.sha256()


def constant(x):
    """Function to declare compile-time constants accessible from Warp kernels

    Args:
        x: Compile-time constant value, can be any of the built-in math types.
    """

    global _constant_hash

    # hash the constant value
    if isinstance(x, builtins.bool):
        # This needs to come before the check for `int` since all boolean
        # values are also instances of `int`.
        _constant_hash.update(struct.pack("?", x))
    elif isinstance(x, int):
        _constant_hash.update(struct.pack("<q", x))
    elif isinstance(x, float):
        _constant_hash.update(struct.pack("<d", x))
    elif isinstance(x, float16):
        # float16 is a special case
        p = ctypes.pointer(ctypes.c_float(x.value))
        _constant_hash.update(p.contents)
    elif isinstance(x, tuple(scalar_types)):
        p = ctypes.pointer(x._type_(x.value))
        _constant_hash.update(p.contents)
    elif isinstance(x, ctypes.Array):
        _constant_hash.update(bytes(x))
    else:
        raise RuntimeError(f"Invalid constant type: {type(x)}")

    return x


def float_to_half_bits(value):
    return warp.context.runtime.core.float_to_half_bits(value)


def half_bits_to_float(value):
    return warp.context.runtime.core.half_bits_to_float(value)


# ----------------------
# built-in types


def vector(length, dtype):
    # canonicalize dtype
    if dtype == int:
        dtype = int32
    elif dtype == float:
        dtype = float32

    class vec_t(ctypes.Array):
        # ctypes.Array data for length, shape and c type:
        _length_ = 0 if length is Any else length
        _shape_ = (_length_,)
        _type_ = ctypes.c_float if dtype in [Scalar, Float] else dtype._type_

        # warp scalar type:
        _wp_scalar_type_ = dtype
        _wp_type_params_ = [length, dtype]
        _wp_generic_type_str_ = "vec_t"
        _wp_constructor_ = "vector"

        # special handling for float16 type: in this case, data is stored
        # as uint16 but it's actually half precision floating point
        # data. This means we need to convert each of the arguments
        # to uint16s containing half float bits before storing them in
        # the array:
        scalar_import = float_to_half_bits if _wp_scalar_type_ == float16 else lambda x: x
        scalar_export = half_bits_to_float if _wp_scalar_type_ == float16 else lambda x: x

        def __init__(self, *args):
            num_args = len(args)
            if num_args == 0:
                super().__init__()
            elif num_args == 1:
                if hasattr(args[0], "__len__"):
                    # try to copy from expanded sequence, e.g. (1, 2, 3)
                    self.__init__(*args[0])
                else:
                    # set all elements to the same value
                    value = vec_t.scalar_import(args[0])
                    for i in range(self._length_):
                        super().__setitem__(i, value)
            elif num_args == self._length_:
                # set all scalar elements
                for i in range(self._length_):
                    super().__setitem__(i, vec_t.scalar_import(args[i]))
            else:
                raise ValueError(
                    f"Invalid number of arguments in vector constructor, expected {self._length_} elements, got {num_args}"
                )

        def __getitem__(self, key):
            if isinstance(key, int):
                return vec_t.scalar_export(super().__getitem__(key))
            elif isinstance(key, slice):
                if self._wp_scalar_type_ == float16:
                    return [vec_t.scalar_export(x) for x in super().__getitem__(key)]
                else:
                    return super().__getitem__(key)
            else:
                raise KeyError(f"Invalid key {key}, expected int or slice")

        def __setitem__(self, key, value):
            if isinstance(key, int):
                super().__setitem__(key, vec_t.scalar_import(value))
                return value
            elif isinstance(key, slice):
                if self._wp_scalar_type_ == float16:
                    super().__setitem__(key, [vec_t.scalar_import(x) for x in value])
                    return value
                else:
                    return super().__setitem__(key, value)
            else:
                raise KeyError(f"Invalid key {key}, expected int or slice")

        def __getattr__(self, name):
            idx = "xyzw".find(name)
            if idx != -1:
                return self.__getitem__(idx)

            return self.__getattribute__(name)

        def __setattr__(self, name, value):
            idx = "xyzw".find(name)
            if idx != -1:
                return self.__setitem__(idx, value)

            return super().__setattr__(name, value)

        def __add__(self, y):
            return warp.add(self, y)

        def __radd__(self, y):
            return warp.add(y, self)

        def __sub__(self, y):
            return warp.sub(self, y)

        def __rsub__(self, y):
            return warp.sub(y, self)

        def __mul__(self, y):
            return warp.mul(self, y)

        def __rmul__(self, x):
            return warp.mul(x, self)

        def __truediv__(self, y):
            return warp.div(self, y)

        def __rtruediv__(self, x):
            return warp.div(x, self)

        def __pos__(self):
            return warp.pos(self)

        def __neg__(self):
            return warp.neg(self)

        def __str__(self):
            return f"[{', '.join(map(str, self))}]"

        def __eq__(self, other):
            for i in range(self._length_):
                if self[i] != other[i]:
                    return False
            return True

        @classmethod
        def from_ptr(cls, ptr):
            if ptr:
                # create a new vector instance and initialize the contents from the binary data
                # this skips float16 conversions, assuming that float16 data is already encoded as uint16
                value = cls()
                ctypes.memmove(ctypes.byref(value), ptr, ctypes.sizeof(cls._type_) * cls._length_)
                return value
            else:
                raise RuntimeError("NULL pointer exception")

    return vec_t


def matrix(shape, dtype):
    assert len(shape) == 2

    # canonicalize dtype
    if dtype == int:
        dtype = int32
    elif dtype == float:
        dtype = float32

    class mat_t(ctypes.Array):
        _length_ = 0 if shape[0] == Any or shape[1] == Any else shape[0] * shape[1]
        _shape_ = (0, 0) if _length_ == 0 else shape
        _type_ = ctypes.c_float if dtype in [Scalar, Float] else dtype._type_

        # warp scalar type:
        # used in type checking and when writing out c++ code for constructors:
        _wp_scalar_type_ = dtype
        _wp_type_params_ = [shape[0], shape[1], dtype]
        _wp_generic_type_str_ = "mat_t"
        _wp_constructor_ = "matrix"

        _wp_row_type_ = vector(0 if shape[1] == Any else shape[1], dtype)

        # special handling for float16 type: in this case, data is stored
        # as uint16 but it's actually half precision floating point
        # data. This means we need to convert each of the arguments
        # to uint16s containing half float bits before storing them in
        # the array:
        scalar_import = float_to_half_bits if _wp_scalar_type_ == float16 else lambda x: x
        scalar_export = half_bits_to_float if _wp_scalar_type_ == float16 else lambda x: x

        def __init__(self, *args):
            num_args = len(args)
            if num_args == 0:
                super().__init__()
            elif num_args == 1:
                if hasattr(args[0], "__len__"):
                    # try to copy from expanded sequence, e.g. [[1, 0], [0, 1]]
                    self.__init__(*args[0])
                else:
                    # set all elements to the same value
                    value = mat_t.scalar_import(args[0])
                    for i in range(self._length_):
                        super().__setitem__(i, value)
            elif num_args == self._length_:
                # set all scalar elements
                for i in range(self._length_):
                    super().__setitem__(i, mat_t.scalar_import(args[i]))
            elif num_args == self._shape_[0]:
                # row vectors
                for i, row in enumerate(args):
                    if not hasattr(row, "__len__") or len(row) != self._shape_[1]:
                        raise TypeError(
                            f"Invalid argument in matrix constructor, expected row of length {self._shape_[1]}, got {row}"
                        )
                    offset = i * self._shape_[1]
                    for i in range(self._shape_[1]):
                        super().__setitem__(offset + i, mat_t.scalar_import(row[i]))
            else:
                raise ValueError(
                    f"Invalid number of arguments in matrix constructor, expected {self._length_} elements, got {num_args}"
                )

        def __add__(self, y):
            return warp.add(self, y)

        def __radd__(self, y):
            return warp.add(y, self)

        def __sub__(self, y):
            return warp.sub(self, y)

        def __rsub__(self, y):
            return warp.sub(y, self)

        def __mul__(self, y):
            return warp.mul(self, y)

        def __rmul__(self, x):
            return warp.mul(x, self)

        def __matmul__(self, y):
            return warp.mul(self, y)

        def __rmatmul__(self, x):
            return warp.mul(x, self)

        def __truediv__(self, y):
            return warp.div(self, y)

        def __rtruediv__(self, x):
            return warp.div(x, self)

        def __pos__(self):
            return warp.pos(self)

        def __neg__(self):
            return warp.neg(self)

        def __str__(self):
            row_str = []
            for r in range(self._shape_[0]):
                row_val = self.get_row(r)
                row_str.append(f"[{', '.join(map(str, row_val))}]")

            return "[" + ",\n ".join(row_str) + "]"

        def __eq__(self, other):
            for i in range(self._shape_[0]):
                for j in range(self._shape_[1]):
                    if self[i][j] != other[i][j]:
                        return False
            return True

        def get_row(self, r):
            if r < 0 or r >= self._shape_[0]:
                raise IndexError("Invalid row index")
            row_start = r * self._shape_[1]
            row_end = row_start + self._shape_[1]
            row_data = super().__getitem__(slice(row_start, row_end))
            if self._wp_scalar_type_ == float16:
                return self._wp_row_type_(*[mat_t.scalar_export(x) for x in row_data])
            else:
                return self._wp_row_type_(row_data)

        def set_row(self, r, v):
            if r < 0 or r >= self._shape_[0]:
                raise IndexError("Invalid row index")
            row_start = r * self._shape_[1]
            row_end = row_start + self._shape_[1]
            if self._wp_scalar_type_ == float16:
                v = [mat_t.scalar_import(x) for x in v]
            super().__setitem__(slice(row_start, row_end), v)

        def __getitem__(self, key):
            if isinstance(key, Tuple):
                # element indexing m[i,j]
                if len(key) != 2:
                    raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
                return mat_t.scalar_export(super().__getitem__(key[0] * self._shape_[1] + key[1]))
            elif isinstance(key, int):
                # row vector indexing m[r]
                return self.get_row(key)
            else:
                raise KeyError(f"Invalid key {key}, expected int or pair of ints")

        def __setitem__(self, key, value):
            if isinstance(key, Tuple):
                # element indexing m[i,j] = x
                if len(key) != 2:
                    raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
                super().__setitem__(key[0] * self._shape_[1] + key[1], mat_t.scalar_import(value))
                return value
            elif isinstance(key, int):
                # row vector indexing m[r] = v
                self.set_row(key, value)
                return value
            else:
                raise KeyError(f"Invalid key {key}, expected int or pair of ints")

        @classmethod
        def from_ptr(cls, ptr):
            if ptr:
                # create a new matrix instance and initialize the contents from the binary data
                # this skips float16 conversions, assuming that float16 data is already encoded as uint16
                value = cls()
                ctypes.memmove(ctypes.byref(value), ptr, ctypes.sizeof(cls._type_) * cls._length_)
                return value
            else:
                raise RuntimeError("NULL pointer exception")

    return mat_t


class void:
    def __init__(self):
        pass


class bool:
    _length_ = 1
    _type_ = ctypes.c_bool

    def __init__(self, x=False):
        self.value = x


class float16:
    _length_ = 1
    _type_ = ctypes.c_uint16

    def __init__(self, x=0.0):
        self.value = x


class float32:
    _length_ = 1
    _type_ = ctypes.c_float

    def __init__(self, x=0.0):
        self.value = x


class float64:
    _length_ = 1
    _type_ = ctypes.c_double

    def __init__(self, x=0.0):
        self.value = x


class int8:
    _length_ = 1
    _type_ = ctypes.c_int8

    def __init__(self, x=0):
        self.value = x


class uint8:
    _length_ = 1
    _type_ = ctypes.c_uint8

    def __init__(self, x=0):
        self.value = x


class int16:
    _length_ = 1
    _type_ = ctypes.c_int16

    def __init__(self, x=0):
        self.value = x


class uint16:
    _length_ = 1
    _type_ = ctypes.c_uint16

    def __init__(self, x=0):
        self.value = x


class int32:
    _length_ = 1
    _type_ = ctypes.c_int32

    def __init__(self, x=0):
        self.value = x


class uint32:
    _length_ = 1
    _type_ = ctypes.c_uint32

    def __init__(self, x=0):
        self.value = x


class int64:
    _length_ = 1
    _type_ = ctypes.c_int64

    def __init__(self, x=0):
        self.value = x


class uint64:
    _length_ = 1
    _type_ = ctypes.c_uint64

    def __init__(self, x=0):
        self.value = x


def quaternion(dtype=Any):
    class quat_t(vector(length=4, dtype=dtype)):
        pass
        # def __init__(self, *args):
        #     super().__init__(args)

    ret = quat_t
    ret._wp_type_params_ = [dtype]
    ret._wp_generic_type_str_ = "quat_t"
    ret._wp_constructor_ = "quaternion"

    return ret


class quath(quaternion(dtype=float16)):
    pass


class quatf(quaternion(dtype=float32)):
    pass


class quatd(quaternion(dtype=float64)):
    pass


def transformation(dtype=Any):
    class transform_t(vector(length=7, dtype=dtype)):
        _wp_init_from_components_sig_ = inspect.Signature(
            (
                inspect.Parameter(
                    "p",
                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    default=(0.0, 0.0, 0.0),
                ),
                inspect.Parameter(
                    "q",
                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    default=(0.0, 0.0, 0.0, 1.0),
                ),
            ),
        )
        _wp_type_params_ = [dtype]
        _wp_generic_type_str_ = "transform_t"
        _wp_constructor_ = "transformation"

        def __init__(self, *args, **kwargs):
            if len(args) == 1 and len(kwargs) == 0:
                if getattr(args[0], "_wp_generic_type_str_") == self._wp_generic_type_str_:
                    # Copy constructor.
                    super().__init__(*args[0])
                    return

            try:
                # For backward compatibility, try to check if the arguments
                # match the original signature that'd allow initializing
                # the `p` and `q` components separately.
                bound_args = self._wp_init_from_components_sig_.bind(*args, **kwargs)
                bound_args.apply_defaults()
                p, q = bound_args.args
            except (TypeError, ValueError):
                # Fallback to the vector's constructor.
                super().__init__(*args)
                return

            # Even if the arguments match the original “from components”
            # signature, we still need to make sure that they represent
            # sequences that can be unpacked.
            if hasattr(p, "__len__") and hasattr(q, "__len__"):
                # Initialize from the `p` and `q` components.
                super().__init__()
                self[0:3] = vector(length=3, dtype=dtype)(*p)
                self[3:7] = quaternion(dtype=dtype)(*q)
                return

            # Fallback to the vector's constructor.
            super().__init__(*args)

        @property
        def p(self):
            return vec3(self[0:3])

        @property
        def q(self):
            return quat(self[3:7])

    return transform_t


class transformh(transformation(dtype=float16)):
    pass


class transformf(transformation(dtype=float32)):
    pass


class transformd(transformation(dtype=float64)):
    pass


class vec2h(vector(length=2, dtype=float16)):
    pass


class vec3h(vector(length=3, dtype=float16)):
    pass


class vec4h(vector(length=4, dtype=float16)):
    pass


class vec2f(vector(length=2, dtype=float32)):
    pass


class vec3f(vector(length=3, dtype=float32)):
    pass


class vec4f(vector(length=4, dtype=float32)):
    pass


class vec2d(vector(length=2, dtype=float64)):
    pass


class vec3d(vector(length=3, dtype=float64)):
    pass


class vec4d(vector(length=4, dtype=float64)):
    pass


class vec2b(vector(length=2, dtype=int8)):
    pass


class vec3b(vector(length=3, dtype=int8)):
    pass


class vec4b(vector(length=4, dtype=int8)):
    pass


class vec2ub(vector(length=2, dtype=uint8)):
    pass


class vec3ub(vector(length=3, dtype=uint8)):
    pass


class vec4ub(vector(length=4, dtype=uint8)):
    pass


class vec2s(vector(length=2, dtype=int16)):
    pass


class vec3s(vector(length=3, dtype=int16)):
    pass


class vec4s(vector(length=4, dtype=int16)):
    pass


class vec2us(vector(length=2, dtype=uint16)):
    pass


class vec3us(vector(length=3, dtype=uint16)):
    pass


class vec4us(vector(length=4, dtype=uint16)):
    pass


class vec2i(vector(length=2, dtype=int32)):
    pass


class vec3i(vector(length=3, dtype=int32)):
    pass


class vec4i(vector(length=4, dtype=int32)):
    pass


class vec2ui(vector(length=2, dtype=uint32)):
    pass


class vec3ui(vector(length=3, dtype=uint32)):
    pass


class vec4ui(vector(length=4, dtype=uint32)):
    pass


class vec2l(vector(length=2, dtype=int64)):
    pass


class vec3l(vector(length=3, dtype=int64)):
    pass


class vec4l(vector(length=4, dtype=int64)):
    pass


class vec2ul(vector(length=2, dtype=uint64)):
    pass


class vec3ul(vector(length=3, dtype=uint64)):
    pass


class vec4ul(vector(length=4, dtype=uint64)):
    pass


class mat22h(matrix(shape=(2, 2), dtype=float16)):
    pass


class mat33h(matrix(shape=(3, 3), dtype=float16)):
    pass


class mat44h(matrix(shape=(4, 4), dtype=float16)):
    pass


class mat22f(matrix(shape=(2, 2), dtype=float32)):
    pass


class mat33f(matrix(shape=(3, 3), dtype=float32)):
    pass


class mat44f(matrix(shape=(4, 4), dtype=float32)):
    pass


class mat22d(matrix(shape=(2, 2), dtype=float64)):
    pass


class mat33d(matrix(shape=(3, 3), dtype=float64)):
    pass


class mat44d(matrix(shape=(4, 4), dtype=float64)):
    pass


class spatial_vectorh(vector(length=6, dtype=float16)):
    pass


class spatial_vectorf(vector(length=6, dtype=float32)):
    pass


class spatial_vectord(vector(length=6, dtype=float64)):
    pass


class spatial_matrixh(matrix(shape=(6, 6), dtype=float16)):
    pass


class spatial_matrixf(matrix(shape=(6, 6), dtype=float32)):
    pass


class spatial_matrixd(matrix(shape=(6, 6), dtype=float64)):
    pass


# built-in type aliases that default to 32bit precision
vec2 = vec2f
vec3 = vec3f
vec4 = vec4f
mat22 = mat22f
mat33 = mat33f
mat44 = mat44f
quat = quatf
transform = transformf
spatial_vector = spatial_vectorf
spatial_matrix = spatial_matrixf


int_types = [int8, uint8, int16, uint16, int32, uint32, int64, uint64]
float_types = [float16, float32, float64]
scalar_types = int_types + float_types

vector_types = [
    vec2b,
    vec2ub,
    vec2s,
    vec2us,
    vec2i,
    vec2ui,
    vec2l,
    vec2ul,
    vec2h,
    vec2f,
    vec2d,
    vec3b,
    vec3ub,
    vec3s,
    vec3us,
    vec3i,
    vec3ui,
    vec3l,
    vec3ul,
    vec3h,
    vec3f,
    vec3d,
    vec4b,
    vec4ub,
    vec4s,
    vec4us,
    vec4i,
    vec4ui,
    vec4l,
    vec4ul,
    vec4h,
    vec4f,
    vec4d,
    mat22h,
    mat22f,
    mat22d,
    mat33h,
    mat33f,
    mat33d,
    mat44h,
    mat44f,
    mat44d,
    quath,
    quatf,
    quatd,
    transformh,
    transformf,
    transformd,
    spatial_vectorh,
    spatial_vectorf,
    spatial_vectord,
    spatial_matrixh,
    spatial_matrixf,
    spatial_matrixd,
]

np_dtype_to_warp_type = {
    np.dtype(np.bool_): bool,
    np.dtype(np.int8): int8,
    np.dtype(np.uint8): uint8,
    np.dtype(np.int16): int16,
    np.dtype(np.uint16): uint16,
    np.dtype(np.int32): int32,
    np.dtype(np.int64): int64,
    np.dtype(np.uint32): uint32,
    np.dtype(np.uint64): uint64,
    np.dtype(np.byte): int8,
    np.dtype(np.ubyte): uint8,
    np.dtype(np.float16): float16,
    np.dtype(np.float32): float32,
    np.dtype(np.float64): float64,
}

warp_type_to_np_dtype = {
    bool: np.bool_,
    int8: np.int8,
    int16: np.int16,
    int32: np.int32,
    int64: np.int64,
    uint8: np.uint8,
    uint16: np.uint16,
    uint32: np.uint32,
    uint64: np.uint64,
    float16: np.float16,
    float32: np.float32,
    float64: np.float64,
}


# represent a Python range iterator
class range_t:
    def __init__(self):
        pass


# definition just for kernel type (cannot be a parameter), see bvh.h
class bvh_query_t:
    """Object used to track state during BVH traversal."""
    def __init__(self):
        pass


# definition just for kernel type (cannot be a parameter), see mesh.h
class mesh_query_aabb_t:
    """Object used to track state during mesh traversal."""
    def __init__(self):
        pass


# definition just for kernel type (cannot be a parameter), see hash_grid.h
class hash_grid_query_t:
    """Object used to track state during neighbor traversal."""
    def __init__(self):
        pass


# maximum number of dimensions, must match array.h
ARRAY_MAX_DIMS = 4
LAUNCH_MAX_DIMS = 4

# must match array.h
ARRAY_TYPE_REGULAR = 0
ARRAY_TYPE_INDEXED = 1
ARRAY_TYPE_FABRIC = 2
ARRAY_TYPE_FABRIC_INDEXED = 3


# represents bounds for kernel launch (number of threads across multiple dimensions)
class launch_bounds_t(ctypes.Structure):
    _fields_ = [("shape", ctypes.c_int32 * LAUNCH_MAX_DIMS), ("ndim", ctypes.c_int32), ("size", ctypes.c_size_t)]

    def __init__(self, shape):
        if isinstance(shape, int):
            # 1d launch
            self.ndim = 1
            self.size = shape
            self.shape[0] = shape

        else:
            # nd launch
            self.ndim = len(shape)
            self.size = 1

            for i in range(self.ndim):
                self.shape[i] = shape[i]
                self.size = self.size * shape[i]

        # initialize the remaining dims to 1
        for i in range(self.ndim, LAUNCH_MAX_DIMS):
            self.shape[i] = 1


class shape_t(ctypes.Structure):
    _fields_ = [("dims", ctypes.c_int32 * ARRAY_MAX_DIMS)]

    def __init__(self):
        pass


class array_t(ctypes.Structure):
    _fields_ = [
        ("data", ctypes.c_uint64),
        ("grad", ctypes.c_uint64),
        ("shape", ctypes.c_int32 * ARRAY_MAX_DIMS),
        ("strides", ctypes.c_int32 * ARRAY_MAX_DIMS),
        ("ndim", ctypes.c_int32),
    ]

    def __init__(self, data=0, grad=0, ndim=0, shape=(0,), strides=(0,)):
        self.data = data
        self.grad = grad
        self.ndim = ndim
        for i in range(ndim):
            self.shape[i] = shape[i]
            self.strides[i] = strides[i]

    # structured type description used when array_t is packed in a struct and shared via numpy structured array.
    @classmethod
    def numpy_dtype(cls):
        return cls._numpy_dtype_

    # structured value used when array_t is packed in a struct and shared via a numpy structured array
    def numpy_value(self):
        return (self.data, self.grad, list(self.shape), list(self.strides), self.ndim)


# NOTE: must match array_t._fields_
array_t._numpy_dtype_ = {
    "names": ["data", "grad", "shape", "strides", "ndim"],
    "formats": ["u8", "u8", f"{ARRAY_MAX_DIMS}i4", f"{ARRAY_MAX_DIMS}i4", "i4"],
    "offsets": [
        array_t.data.offset,
        array_t.grad.offset,
        array_t.shape.offset,
        array_t.strides.offset,
        array_t.ndim.offset,
    ],
    "itemsize": ctypes.sizeof(array_t),
}


class indexedarray_t(ctypes.Structure):
    _fields_ = [
        ("data", array_t),
        ("indices", ctypes.c_void_p * ARRAY_MAX_DIMS),
        ("shape", ctypes.c_int32 * ARRAY_MAX_DIMS),
    ]

    def __init__(self, data, indices, shape):
        if data is None:
            self.data = array().__ctype__()
            for i in range(ARRAY_MAX_DIMS):
                self.indices[i] = ctypes.c_void_p(None)
                self.shape[i] = 0
        else:
            self.data = data.__ctype__()
            for i in range(data.ndim):
                if indices[i] is not None:
                    self.indices[i] = ctypes.c_void_p(indices[i].ptr)
                else:
                    self.indices[i] = ctypes.c_void_p(None)
                self.shape[i] = shape[i]


def type_ctype(dtype):
    if dtype == float:
        return ctypes.c_float
    elif dtype == int:
        return ctypes.c_int32
    else:
        # scalar type
        return dtype._type_


def type_length(dtype):
    if dtype == float or dtype == int or isinstance(dtype, warp.codegen.Struct):
        return 1
    else:
        return dtype._length_


def type_scalar_type(dtype):
    return getattr(dtype, "_wp_scalar_type_", dtype)


def type_size_in_bytes(dtype):
    if dtype.__module__ == "ctypes":
        return ctypes.sizeof(dtype)
    elif isinstance(dtype, warp.codegen.Struct):
        return ctypes.sizeof(dtype.ctype)
    elif dtype == float or dtype == int:
        return 4
    elif hasattr(dtype, "_type_"):
        return getattr(dtype, "_length_", 1) * ctypes.sizeof(dtype._type_)

    else:
        return 0


def type_to_warp(dtype):
    if dtype == float:
        return float32
    elif dtype == int:
        return int32
    else:
        return dtype


def type_typestr(dtype):
    if dtype == bool:
        return "?"
    elif dtype == float16:
        return "<f2"
    elif dtype == float32:
        return "<f4"
    elif dtype == float64:
        return "<f8"
    elif dtype == int8:
        return "b"
    elif dtype == uint8:
        return "B"
    elif dtype == int16:
        return "<i2"
    elif dtype == uint16:
        return "<u2"
    elif dtype == int32:
        return "<i4"
    elif dtype == uint32:
        return "<u4"
    elif dtype == int64:
        return "<i8"
    elif dtype == uint64:
        return "<u8"
    elif isinstance(dtype, warp.codegen.Struct):
        return f"|V{ctypes.sizeof(dtype.ctype)}"
    elif issubclass(dtype, ctypes.Array):
        return type_typestr(dtype._wp_scalar_type_)
    else:
        raise Exception("Unknown ctype")


# converts any known type to a human readable string, good for error messages, reporting etc
def type_repr(t):
    if is_array(t):
        return str(f"array(ndim={t.ndim}, dtype={t.dtype})")
    if type_is_vector(t):
        return str(f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})")
    if type_is_matrix(t):
        return str(f"matrix(shape=({t._shape_[0]}, {t._shape_[1]}), dtype={t._wp_scalar_type_})")
    if isinstance(t, warp.codegen.Struct):
        return type_repr(t.cls)
    if t in scalar_types:
        return t.__name__

    try:
        return t.__module__ + "." + t.__qualname__
    except AttributeError:
        return str(t)


def type_is_int(t):
    if t == int:
        t = int32

    return t in int_types


def type_is_float(t):
    if t == float:
        t = float32

    return t in float_types


# returns True if the passed *type* is a vector
def type_is_vector(t):
    if hasattr(t, "_wp_generic_type_str_") and t._wp_generic_type_str_ == "vec_t":
        return True
    else:
        return False


# returns True if the passed *type* is a matrix
def type_is_matrix(t):
    if hasattr(t, "_wp_generic_type_str_") and t._wp_generic_type_str_ == "mat_t":
        return True
    else:
        return False


# returns true for all value types (int, float, bool, scalars, vectors, matrices)
def type_is_value(x):
    if (x == int) or (x == float) or (x == builtins.bool) or (x in scalar_types) or issubclass(x, ctypes.Array):
        return True
    else:
        return False


# equivalent of the above but for values
def is_int(x):
    return type_is_int(type(x))


def is_float(x):
    return type_is_float(type(x))


def is_value(x):
    return type_is_value(type(x))


# returns true if the passed *instance* is one of the array types
def is_array(a):
    return isinstance(a, array_types)


def types_equal(a, b, match_generic=False):
    # convert to canonical types
    if a == float:
        a = float32
    elif a == int:
        a = int32

    if b == float:
        b = float32
    elif b == int:
        b = int32

    compatible_bool_types = [builtins.bool, bool]

    def are_equal(p1, p2):
        if match_generic:
            if p1 == Any or p2 == Any:
                return True
            if p1 == Scalar and p2 in scalar_types:
                return True
            if p2 == Scalar and p1 in scalar_types:
                return True
            if p1 == Scalar and p2 == Scalar:
                return True
            if p1 == Float and p2 in float_types:
                return True
            if p2 == Float and p1 in float_types:
                return True
            if p1 == Float and p2 == Float:
                return True

        # convert to canonical types
        if p1 == float:
            p1 = float32
        elif p1 == int:
            p1 = int32

        if p2 == float:
            p2 = float32
        elif b == int:
            p2 = int32

        if p1 in compatible_bool_types and p2 in compatible_bool_types:
            return True
        else:
            return p1 == p2

    if (
        hasattr(a, "_wp_generic_type_str_")
        and hasattr(b, "_wp_generic_type_str_")
        and a._wp_generic_type_str_ == b._wp_generic_type_str_
    ):
        return all([are_equal(p1, p2) for p1, p2 in zip(a._wp_type_params_, b._wp_type_params_)])
    if is_array(a) and type(a) is type(b):
        return True
    else:
        return are_equal(a, b)


def strides_from_shape(shape: Tuple, dtype):
    ndims = len(shape)
    strides = [None] * ndims

    i = ndims - 1
    strides[i] = type_size_in_bytes(dtype)

    while i > 0:
        strides[i - 1] = strides[i] * shape[i]
        i -= 1

    return tuple(strides)


class array(Array):
    # member attributes available during code-gen (e.g.: d = array.shape[0])
    # (initialized when needed)
    _vars = None

    def __init__(
        self,
        data=None,
        dtype: DType = Any,
        shape=None,
        strides=None,
        length=None,
        ptr=None,
        capacity=None,
        device=None,
        pinned=False,
        copy=True,
        owner=True,  # TODO: replace with deleter=None
        ndim=None,
        grad=None,
        requires_grad=False,
    ):
        """Constructs a new Warp array object

        When the ``data`` argument is a valid list, tuple, or ndarray the array will be constructed from this object's data.
        For objects that are not stored sequentially in memory (e.g.: a list), then the data will first
        be flattened before being transferred to the memory space given by device.

        The second construction path occurs when the ``ptr`` argument is a non-zero uint64 value representing the
        start address in memory where existing array data resides, e.g.: from an external or C-library. The memory
        allocation should reside on the same device given by the device argument, and the user should set the length
        and dtype parameter appropriately.

        If neither ``data`` nor ``ptr`` are specified, the ``shape`` or ``length`` arguments are checked next.
        This construction path can be used to create new uninitialized arrays, but users are encouraged to call
        ``wp.empty()``, ``wp.zeros()``, or ``wp.full()`` instead to create new arrays.

        If none of the above arguments are specified, a simple type annotation is constructed.  This is used when annotating
        kernel arguments or struct members (e.g.,``arr: wp.array(dtype=float)``).  In this case, only ``dtype`` and ``ndim``
        are taken into account and no memory is allocated for the array.

        Args:
            data (Union[list, tuple, ndarray]) An object to construct the array from, can be a Tuple, List, or generally any type convertible to an np.array
            dtype (Union): One of the built-in types, e.g.: :class:`warp.mat33`, if dtype is Any and data an ndarray then it will be inferred from the array data type
            shape (tuple): Dimensions of the array
            strides (tuple): Number of bytes in each dimension between successive elements of the array
            length (int): Number of elements of the data type (deprecated, users should use `shape` argument)
            ptr (uint64): Address of an external memory address to alias (data should be None)
            capacity (int): Maximum size in bytes of the ptr allocation (data should be None)
            device (Devicelike): Device the array lives on
            copy (bool): Whether the incoming data will be copied or aliased, this is only possible when the incoming `data` already lives on the device specified and types match
            owner (bool): Should the array object try to deallocate memory when it is deleted
            requires_grad (bool): Whether or not gradients will be tracked for this array, see :class:`warp.Tape` for details
            grad (array): The gradient array to use
            pinned (bool): Whether to allocate pinned host memory, which allows asynchronous host-device transfers (only applicable with device="cpu")

        """

        self.owner = False
        self.ctype = None
        self._requires_grad = False
        self._grad = None
        # __array_interface__ or __cuda_array_interface__, evaluated lazily and cached
        self._array_interface = None
        self.is_transposed = False

        # canonicalize dtype
        if dtype == int:
            dtype = int32
        elif dtype == float:
            dtype = float32

        # convert shape to tuple (or leave shape=None if neither shape nor length were specified)
        if shape is not None:
            if isinstance(shape, int):
                shape = (shape,)
            else:
                shape = tuple(shape)
                if len(shape) > ARRAY_MAX_DIMS:
                    raise RuntimeError(
                        f"Failed to create array with shape {shape}, the maximum number of dimensions is {ARRAY_MAX_DIMS}"
                    )
        elif length is not None:
            # backward compatibility
            shape = (length,)

        # determine the construction path from the given arguments
        if data is not None:
            # data or ptr, not both
            if ptr is not None:
                raise RuntimeError("Can only construct arrays with either `data` or `ptr` arguments, not both")
            self._init_from_data(data, dtype, shape, device, copy, pinned)
        elif ptr is not None:
            self._init_from_ptr(ptr, dtype, shape, strides, capacity, device, owner, pinned)
        elif shape is not None:
            self._init_new(dtype, shape, strides, device, pinned)
        else:
            self._init_annotation(dtype, ndim or 1)

        # initialize gradient, if needed
        if self.device is not None:
            if grad is not None:
                # this will also check whether the gradient array is compatible
                self.grad = grad
            else:
                # allocate gradient if needed
                self._requires_grad = requires_grad
                if requires_grad:
                    with warp.ScopedStream(self.device.null_stream):
                        self._alloc_grad()

    def _init_from_data(self, data, dtype, shape, device, copy, pinned):
        if not hasattr(data, "__len__"):
            raise RuntimeError(f"Data must be a sequence or array, got scalar {data}")

        if hasattr(dtype, "_wp_scalar_type_"):
            dtype_shape = dtype._shape_
            dtype_ndim = len(dtype_shape)
            scalar_dtype = dtype._wp_scalar_type_
        else:
            dtype_shape = ()
            dtype_ndim = 0
            scalar_dtype = dtype

        # convert input data to ndarray (handles lists, tuples, etc.) and determine dtype
        if dtype == Any:
            # infer dtype from data
            try:
                arr = np.array(data, copy=False, ndmin=1)
            except Exception as e:
                raise RuntimeError(f"Failed to convert input data to an array: {e}")
            dtype = np_dtype_to_warp_type.get(arr.dtype)
            if dtype is None:
                raise RuntimeError(f"Unsupported input data dtype: {arr.dtype}")
        elif isinstance(dtype, warp.codegen.Struct):
            if isinstance(data, np.ndarray):
                # construct from numpy structured array
                if data.dtype != dtype.numpy_dtype():
                    raise RuntimeError(
                        f"Invalid source data type for array of structs, expected {dtype.numpy_dtype()}, got {data.dtype}"
                    )
                arr = data
            elif isinstance(data, (list, tuple)):
                # construct from a sequence of structs
                try:
                    # convert each struct instance to its corresponding ctype
                    ctype_list = [v.__ctype__() for v in data]
                    # convert the list of ctypes to a contiguous ctypes array
                    ctype_arr = (dtype.ctype * len(ctype_list))(*ctype_list)
                    # convert to numpy
                    arr = np.frombuffer(ctype_arr, dtype=dtype.ctype)
                except Exception as e:
                    raise RuntimeError(
                        f"Error while trying to construct Warp array from a sequence of Warp structs: {e}"
                    )
            else:
                raise RuntimeError(
                    "Invalid data argument for array of structs, expected a sequence of structs or a NumPy structured array"
                )
        else:
            # convert input data to the given dtype
            npdtype = warp_type_to_np_dtype.get(scalar_dtype)
            if npdtype is None:
                raise RuntimeError(
                    f"Failed to convert input data to an array with Warp type {warp.context.type_str(dtype)}"
                )
            try:
                arr = np.array(data, dtype=npdtype, copy=False, ndmin=1)
            except Exception as e:
                raise RuntimeError(f"Failed to convert input data to an array with type {npdtype}: {e}")

        # determine whether the input needs reshaping
        target_npshape = None
        if shape is not None:
            target_npshape = (*shape, *dtype_shape)
        elif dtype_ndim > 0:
            # prune inner dimensions of length 1
            while arr.ndim > 1 and arr.shape[-1] == 1:
                arr = np.squeeze(arr, axis=-1)
            # if the inner dims don't match exactly, check if the innermost dim is a multiple of type length
            if arr.ndim < dtype_ndim or arr.shape[-dtype_ndim:] != dtype_shape:
                if arr.shape[-1] == dtype._length_:
                    target_npshape = (*arr.shape[:-1], *dtype_shape)
                elif arr.shape[-1] % dtype._length_ == 0:
                    target_npshape = (*arr.shape[:-1], arr.shape[-1] // dtype._length_, *dtype_shape)
                else:
                    if dtype_ndim == 1:
                        raise RuntimeError(
                            f"The inner dimensions of the input data are not compatible with the requested vector type {warp.context.type_str(dtype)}: expected an inner dimension that is a multiple of {dtype._length_}"
                        )
                    else:
                        raise RuntimeError(
                            f"The inner dimensions of the input data are not compatible with the requested matrix type {warp.context.type_str(dtype)}: expected inner dimensions {dtype._shape_} or a multiple of {dtype._length_}"
                        )

        if target_npshape is not None:
            try:
                arr = arr.reshape(target_npshape)
            except Exception as e:
                raise RuntimeError(
                    f"Failed to reshape the input data to the given shape {shape} and type {warp.context.type_str(dtype)}: {e}"
                )

        # determine final shape and strides
        if dtype_ndim > 0:
            # make sure the inner dims are contiguous for vector/matrix types
            scalar_size = type_size_in_bytes(dtype._wp_scalar_type_)
            inner_contiguous = arr.strides[-1] == scalar_size
            if inner_contiguous and dtype_ndim > 1:
                inner_contiguous = arr.strides[-2] == scalar_size * dtype_shape[-1]

            if not inner_contiguous:
                arr = np.ascontiguousarray(arr)

            shape = arr.shape[:-dtype_ndim] or (1,)
            strides = arr.strides[:-dtype_ndim] or (type_size_in_bytes(dtype),)
        else:
            shape = arr.shape or (1,)
            strides = arr.strides or (type_size_in_bytes(dtype),)

        device = warp.get_device(device)

        if device.is_cpu and not copy and not pinned:
            # reference numpy memory directly
            self._init_from_ptr(arr.ctypes.data, dtype, shape, strides, None, device, False, False)
            # keep a ref to the source array to keep allocation alive
            self._ref = arr
        else:
            # copy data into a new array
            self._init_new(dtype, shape, None, device, pinned)
            src = array(
                ptr=arr.ctypes.data,
                dtype=dtype,
                shape=shape,
                strides=strides,
                device="cpu",
                copy=False,
                owner=False,
            )
            warp.copy(self, src)

    def _init_from_ptr(self, ptr, dtype, shape, strides, capacity, device, owner, pinned):
        if dtype == Any:
            raise RuntimeError("A concrete data type is required to create the array")

        device = warp.get_device(device)

        size = 1
        for d in shape:
            size *= d

        contiguous_strides = strides_from_shape(shape, dtype)

        if strides is None:
            strides = contiguous_strides
            is_contiguous = True
            if capacity is None:
                capacity = size * type_size_in_bytes(dtype)
        else:
            is_contiguous = strides == contiguous_strides
            if capacity is None:
                capacity = shape[0] * strides[0]

        self.dtype = dtype
        self.ndim = len(shape)
        self.size = size
        self.capacity = capacity
        self.shape = shape
        self.strides = strides
        self.ptr = ptr
        self.device = device
        self.owner = owner
        self.pinned = pinned if device.is_cpu else False
        self.is_contiguous = is_contiguous

    def _init_new(self, dtype, shape, strides, device, pinned):
        if dtype == Any:
            raise RuntimeError("A concrete data type is required to create the array")

        device = warp.get_device(device)

        size = 1
        for d in shape:
            size *= d

        contiguous_strides = strides_from_shape(shape, dtype)

        if strides is None:
            strides = contiguous_strides
            is_contiguous = True
            capacity = size * type_size_in_bytes(dtype)
        else:
            is_contiguous = strides == contiguous_strides
            capacity = shape[0] * strides[0]

        if capacity > 0:
            ptr = device.allocator.alloc(capacity, pinned=pinned)
            if ptr is None:
                raise RuntimeError(f"Array allocation failed on device: {device} for {capacity} bytes")
        else:
            ptr = None

        self.dtype = dtype
        self.ndim = len(shape)
        self.size = size
        self.capacity = capacity
        self.shape = shape
        self.strides = strides
        self.ptr = ptr
        self.device = device
        self.owner = True
        self.pinned = pinned if device.is_cpu else False
        self.is_contiguous = is_contiguous

    def _init_annotation(self, dtype, ndim):
        self.dtype = dtype
        self.ndim = ndim
        self.size = 0
        self.capacity = 0
        self.shape = (0,) * ndim
        self.strides = (0,) * ndim
        self.ptr = None
        self.device = None
        self.owner = False
        self.pinned = False
        self.is_contiguous = False

    @property
    def __array_interface__(self):
        # raising an AttributeError here makes hasattr() return False
        if self.device is None or not self.device.is_cpu:
            raise AttributeError(f"__array_interface__ not supported because device is {self.device}")

        if self._array_interface is None:
            # get flat shape (including type shape)
            if isinstance(self.dtype, warp.codegen.Struct):
                # struct
                arr_shape = self.shape
                arr_strides = self.strides
                descr = self.dtype.numpy_dtype()
            elif issubclass(self.dtype, ctypes.Array):
                # vector type, flatten the dimensions into one tuple
                arr_shape = (*self.shape, *self.dtype._shape_)
                dtype_strides = strides_from_shape(self.dtype._shape_, self.dtype._type_)
                arr_strides = (*self.strides, *dtype_strides)
                descr = None
            else:
                # scalar type
                arr_shape = self.shape
                arr_strides = self.strides
                descr = None

            self._array_interface = {
                "data": (self.ptr if self.ptr is not None else 0, False),
                "shape": tuple(arr_shape),
                "strides": tuple(arr_strides),
                "typestr": type_typestr(self.dtype),
                "descr": descr,  # optional description of structured array layout
                "version": 3,
            }

        return self._array_interface

    @property
    def __cuda_array_interface__(self):
        # raising an AttributeError here makes hasattr() return False
        if self.device is None or not self.device.is_cuda:
            raise AttributeError(f"__cuda_array_interface__ is not supported because device is {self.device}")

        if self._array_interface is None:
            # get flat shape (including type shape)
            if issubclass(self.dtype, ctypes.Array):
                # vector type, flatten the dimensions into one tuple
                arr_shape = (*self.shape, *self.dtype._shape_)
                dtype_strides = strides_from_shape(self.dtype._shape_, self.dtype._type_)
                arr_strides = (*self.strides, *dtype_strides)
            else:
                # scalar or struct type
                arr_shape = self.shape
                arr_strides = self.strides

            self._array_interface = {
                "data": (self.ptr if self.ptr is not None else 0, False),
                "shape": tuple(arr_shape),
                "strides": tuple(arr_strides),
                "typestr": type_typestr(self.dtype),
                "version": 2,
            }

        return self._array_interface

    def __del__(self):
        if self.owner:
            # use CUDA context guard to avoid side effects during garbage collection
            with self.device.context_guard:
                self.device.allocator.free(self.ptr, self.capacity, self.pinned)

    def __len__(self):
        return self.shape[0]

    def __str__(self):
        if self.device is None:
            # for 'empty' arrays we just return the type information, these are used in kernel function signatures
            return f"array{self.dtype}"
        else:
            return str(self.numpy())

    def __getitem__(self, key):
        if isinstance(key, int):
            if self.ndim == 1:
                raise RuntimeError("Item indexing is not supported on wp.array objects")
            key = [key]
        elif isinstance(key, (slice, array)):
            key = [key]
        elif isinstance(key, Tuple):
            contains_slice = False
            contains_indices = False
            for k in key:
                if isinstance(k, slice):
                    contains_slice = True
                if isinstance(k, array):
                    contains_indices = True
            if not contains_slice and not contains_indices and len(key) == self.ndim:
                raise RuntimeError("Item indexing is not supported on wp.array objects")
        else:
            raise RuntimeError(f"Invalid index: {key}")

        new_key = []
        for i in range(0, len(key)):
            new_key.append(key[i])
        for i in range(len(key), self.ndim):
            new_key.append(slice(None, None, None))
        key = tuple(new_key)

        new_shape = []
        new_strides = []
        ptr_offset = 0
        new_dim = self.ndim

        # maps dimension index to an array of indices, if given
        index_arrays = {}

        for idx, k in enumerate(key):
            if isinstance(k, slice):
                start, stop, step = k.start, k.stop, k.step
                if start is None:
                    start = 0
                if stop is None:
                    stop = self.shape[idx]
                if step is None:
                    step = 1
                if start < 0:
                    start = self.shape[idx] + start
                if stop < 0:
                    stop = self.shape[idx] + stop

                if start < 0 or start >= self.shape[idx]:
                    raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
                if stop < 1 or stop > self.shape[idx]:
                    raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
                if stop <= start:
                    raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")

                new_shape.append(-((stop - start) // -step))  # ceil division
                new_strides.append(self.strides[idx] * step)

                ptr_offset += self.strides[idx] * start

            elif isinstance(k, array):
                # note: index array properties will be checked during indexedarray construction
                index_arrays[idx] = k

                # shape and strides are unchanged for this dimension
                new_shape.append(self.shape[idx])
                new_strides.append(self.strides[idx])

            else:  # is int
                start = k
                if start < 0:
                    start = self.shape[idx] + start
                if start < 0 or start >= self.shape[idx]:
                    raise RuntimeError(f"Invalid indexing in slice: {k}")
                new_dim -= 1

                ptr_offset += self.strides[idx] * start

        # handle grad
        if self.grad is not None:
            new_grad = array(
                ptr=self.grad.ptr + ptr_offset if self.grad.ptr is not None else None,
                dtype=self.grad.dtype,
                shape=tuple(new_shape),
                strides=tuple(new_strides),
                device=self.grad.device,
                pinned=self.grad.pinned,
                owner=False,
            )
            # store back-ref to stop data being destroyed
            new_grad._ref = self.grad
        else:
            new_grad = None

        a = array(
            ptr=self.ptr + ptr_offset if self.ptr is not None else None,
            dtype=self.dtype,
            shape=tuple(new_shape),
            strides=tuple(new_strides),
            device=self.device,
            pinned=self.pinned,
            owner=False,
            grad=new_grad,
        )

        # store back-ref to stop data being destroyed
        a._ref = self

        if index_arrays:
            indices = [None] * self.ndim
            for dim, index_array in index_arrays.items():
                indices[dim] = index_array
            return indexedarray(a, indices)
        else:
            return a

    # construct a C-representation of the array for passing to kernels
    def __ctype__(self):
        if self.ctype is None:
            data = 0 if self.ptr is None else ctypes.c_uint64(self.ptr)
            grad = 0 if self.grad is None or self.grad.ptr is None else ctypes.c_uint64(self.grad.ptr)
            self.ctype = array_t(data=data, grad=grad, ndim=self.ndim, shape=self.shape, strides=self.strides)

        return self.ctype

    def __matmul__(self, other):
        """
        Enables A @ B syntax for matrix multiplication
        """
        if self.ndim != 2 or other.ndim != 2:
            raise RuntimeError(
                "A has dim = {}, B has dim = {}. If multiplying with @, A and B must have dim = 2.".format(
                    self.ndim, other.ndim
                )
            )

        m = self.shape[0]
        n = other.shape[1]
        c = warp.zeros(shape=(m, n), dtype=self.dtype, device=self.device, requires_grad=True)
        d = warp.zeros(shape=(m, n), dtype=self.dtype, device=self.device, requires_grad=True)
        matmul(self, other, c, d, device=self.device)
        return d

    @property
    def grad(self):
        return self._grad

    @grad.setter
    def grad(self, grad):
        if grad is None:
            self._grad = None
            self._requires_grad = False
        else:
            # make sure the given gradient array is compatible
            if (
                grad.dtype != self.dtype
                or grad.shape != self.shape
                or grad.strides != self.strides
                or grad.device != self.device
            ):
                raise ValueError("The given gradient array is incompatible")
            self._grad = grad
            self._requires_grad = True

        # trigger re-creation of C-representation
        self.ctype = None

    @property
    def requires_grad(self):
        return self._requires_grad

    @requires_grad.setter
    def requires_grad(self, value: builtins.bool):
        if value and self._grad is None:
            self._alloc_grad()
        elif not value:
            self._grad = None

        self._requires_grad = value

        # trigger re-creation of C-representation
        self.ctype = None

    def _alloc_grad(self):
        self._grad = array(
            dtype=self.dtype, shape=self.shape, strides=self.strides, device=self.device, pinned=self.pinned
        )
        self._grad.zero_()

        # trigger re-creation of C-representation
        self.ctype = None

    @property
    def vars(self):
        # member attributes available during code-gen (e.g.: d = array.shape[0])
        # Note: we use a shared dict for all array instances
        if array._vars is None:
            array._vars = {"shape": warp.codegen.Var("shape", shape_t)}
        return array._vars

    def zero_(self):
        """Zeroes-out the array entires."""
        if self.is_contiguous:
            # simple memset is usually faster than generic fill
            self.device.memset(self.ptr, 0, self.size * type_size_in_bytes(self.dtype))
        else:
            self.fill_(0)

    def fill_(self, value):
        """Set all array entries to `value`

        args:
            value: The value to set every array entry to. Must be convertible to the array's ``dtype``.

        Raises:
            ValueError: If `value` cannot be converted to the array's ``dtype``.

        Examples:
            ``fill_()`` can take lists or other sequences when filling arrays of vectors or matrices.

            >>> arr = wp.zeros(2, dtype=wp.mat22)
            >>> arr.numpy()
            array([[[0., 0.],
                    [0., 0.]],
            <BLANKLINE>
                   [[0., 0.],
                    [0., 0.]]], dtype=float32)
            >>> arr.fill_([[1, 2], [3, 4]])
            >>> arr.numpy()
            array([[[1., 2.],
                    [3., 4.]],
            <BLANKLINE>
                   [[1., 2.],
                    [3., 4.]]], dtype=float32)
        """
        if self.size == 0:
            return

        # try to convert the given value to the array dtype
        try:
            if isinstance(self.dtype, warp.codegen.Struct):
                if isinstance(value, self.dtype.cls):
                    cvalue = value.__ctype__()
                elif value == 0:
                    # allow zero-initializing structs using default constructor
                    cvalue = self.dtype().__ctype__()
                else:
                    raise ValueError(
                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
                    )
            elif issubclass(self.dtype, ctypes.Array):
                # vector/matrix
                cvalue = self.dtype(value)
            else:
                # scalar
                if type(value) in warp.types.scalar_types:
                    value = value.value
                if self.dtype == float16:
                    cvalue = self.dtype._type_(float_to_half_bits(value))
                else:
                    cvalue = self.dtype._type_(value)
        except Exception as e:
            raise ValueError(f"Failed to convert the value to the array data type: {e}")

        cvalue_ptr = ctypes.pointer(cvalue)
        cvalue_size = ctypes.sizeof(cvalue)

        # prefer using memtile for contiguous arrays, because it should be faster than generic fill
        if self.is_contiguous:
            self.device.memtile(self.ptr, cvalue_ptr, cvalue_size, self.size)
        else:
            carr = self.__ctype__()
            carr_ptr = ctypes.pointer(carr)

            if self.device.is_cuda:
                warp.context.runtime.core.array_fill_device(
                    self.device.context, carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size
                )
            else:
                warp.context.runtime.core.array_fill_host(carr_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)

    def assign(self, src):
        """Wraps ``src`` in an :class:`warp.array` if it is not already one and copies the contents to ``self``."""
        if is_array(src):
            warp.copy(self, src)
        else:
            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))

    def numpy(self):
        """Converts the array to a :class:`numpy.ndarray` (aliasing memory through the array interface protocol)
        If the array is on the GPU, a synchronous device-to-host copy (on the CUDA default stream) will be
        automatically performed to ensure that any outstanding work is completed.
        """
        if self.ptr:
            # use the CUDA default stream for synchronous behaviour with other streams
            with warp.ScopedStream(self.device.null_stream):
                a = self.to("cpu", requires_grad=False)
            # convert through __array_interface__
            # Note: this handles arrays of structs using `descr`, so the result will be a structured NumPy array
            return np.array(a, copy=False)
        else:
            # return an empty numpy array with the correct dtype and shape
            if isinstance(self.dtype, warp.codegen.Struct):
                npdtype = self.dtype.numpy_dtype()
                npshape = self.shape
            elif issubclass(self.dtype, ctypes.Array):
                npdtype = warp_type_to_np_dtype[self.dtype._wp_scalar_type_]
                npshape = (*self.shape, *self.dtype._shape_)
            else:
                npdtype = warp_type_to_np_dtype[self.dtype]
                npshape = self.shape
            return np.empty(npshape, dtype=npdtype)

    def cptr(self):
        """Return a ctypes cast of the array address.

        Notes:

        #. Only CPU arrays support this method.
        #. The array must be contiguous.
        #. Accesses to this object are **not** bounds checked.
        #. For ``float16`` types, a pointer to the internal ``uint16`` representation is returned.
        """
        if not self.ptr:
            return None

        if self.device != "cpu" or not self.is_contiguous:
            raise RuntimeError(
                "Accessing array memory through a ctypes ptr is only supported for contiguous CPU arrays."
            )

        if isinstance(self.dtype, warp.codegen.Struct):
            p = ctypes.cast(self.ptr, ctypes.POINTER(self.dtype.ctype))
        else:
            p = ctypes.cast(self.ptr, ctypes.POINTER(self.dtype._type_))

        # store backref to the underlying array to avoid it being deallocated
        p._ref = self

        return p

    def list(self):
        """Returns a flattened list of items in the array as a Python list."""
        a = self.numpy()

        if isinstance(self.dtype, warp.codegen.Struct):
            # struct
            a = a.flatten()
            data = a.ctypes.data
            stride = a.strides[0]
            return [self.dtype.from_ptr(data + i * stride) for i in range(self.size)]
        elif issubclass(self.dtype, ctypes.Array):
            # vector/matrix - flatten, but preserve inner vector/matrix dimensions
            a = a.reshape((self.size, *self.dtype._shape_))
            data = a.ctypes.data
            stride = a.strides[0]
            return [self.dtype.from_ptr(data + i * stride) for i in range(self.size)]
        else:
            # scalar
            return list(a.flatten())

    def to(self, device, requires_grad=None):
        """Returns a Warp array with this array's data moved to the specified device, no-op if already on device."""
        device = warp.get_device(device)
        if self.device == device:
            return self
        else:
            return warp.clone(self, device=device, requires_grad=requires_grad)

    def flatten(self):
        """Returns a zero-copy view of the array collapsed to 1-D. Only supported for contiguous arrays."""
        if self.ndim == 1:
            return self

        if not self.is_contiguous:
            raise RuntimeError("Flattening non-contiguous arrays is unsupported.")

        a = array(
            ptr=self.ptr,
            dtype=self.dtype,
            shape=(self.size,),
            device=self.device,
            pinned=self.pinned,
            copy=False,
            owner=False,
            grad=None if self.grad is None else self.grad.flatten(),
        )

        # store back-ref to stop data being destroyed
        a._ref = self
        return a

    def reshape(self, shape):
        """Returns a reshaped array. Only supported for contiguous arrays.

        Args:
            shape : An int or tuple of ints specifying the shape of the returned array.
        """
        if not self.is_contiguous:
            raise RuntimeError("Reshaping non-contiguous arrays is unsupported.")

        # convert shape to tuple
        if shape is None:
            raise RuntimeError("shape parameter is required.")
        if isinstance(shape, int):
            shape = (shape,)
        elif not isinstance(shape, tuple):
            shape = tuple(shape)

        if len(shape) > ARRAY_MAX_DIMS:
            raise RuntimeError(
                f"Arrays may only have {ARRAY_MAX_DIMS} dimensions maximum, trying to create array with {len(shape)} dims."
            )

        # check for -1 dimension and reformat
        if -1 in shape:
            idx = self.size
            denom = 1
            minus_one_count = 0
            for i, d in enumerate(shape):
                if d == -1:
                    idx = i
                    minus_one_count += 1
                else:
                    denom *= d
            if minus_one_count > 1:
                raise RuntimeError("Cannot infer shape if more than one index is -1.")
            new_shape = list(shape)
            new_shape[idx] = int(self.size / denom)
            shape = tuple(new_shape)

        size = 1
        for d in shape:
            size *= d

        if size != self.size:
            raise RuntimeError("Reshaped array must have the same total size as the original.")

        a = array(
            ptr=self.ptr,
            dtype=self.dtype,
            shape=shape,
            strides=None,
            device=self.device,
            pinned=self.pinned,
            copy=False,
            owner=False,
            grad=None if self.grad is None else self.grad.reshape(shape),
        )

        # store back-ref to stop data being destroyed
        a._ref = self
        return a

    def view(self, dtype):
        """Returns a zero-copy view of this array's memory with a different data type.
        ``dtype`` must have the same byte size of the array's native ``dtype``.
        """
        if type_size_in_bytes(dtype) != type_size_in_bytes(self.dtype):
            raise RuntimeError("Cannot cast dtypes of unequal byte size")

        # return an alias of the array memory with different type information
        a = array(
            ptr=self.ptr,
            dtype=dtype,
            shape=self.shape,
            strides=self.strides,
            device=self.device,
            pinned=self.pinned,
            copy=False,
            owner=False,
            grad=None if self.grad is None else self.grad.view(dtype),
        )

        a._ref = self
        return a

    def contiguous(self):
        """Returns a contiguous array with this array's data. No-op if array is already contiguous."""
        if self.is_contiguous:
            return self

        a = warp.empty_like(self)
        warp.copy(a, self)
        return a

    def transpose(self, axes=None):
        """Returns an zero-copy view of the array with axes transposed.

        Note: The transpose operation will return an array with a non-contiguous access pattern.

        Args:
            axes (optional): Specifies the how the axes are permuted. If not specified, the axes order will be reversed.
        """
        # noop if 1d array
        if self.ndim == 1:
            return self

        if axes is None:
            # reverse the order of the axes
            axes = range(self.ndim)[::-1]
        elif len(axes) != len(self.shape):
            raise RuntimeError("Length of parameter axes must be equal in length to array shape")

        shape = []
        strides = []
        for a in axes:
            if not isinstance(a, int):
                raise RuntimeError(f"axis index {a} is not of type int")
            if a >= len(self.shape):
                raise RuntimeError(f"axis index {a} must be smaller than the number of axes in array")
            shape.append(self.shape[a])
            strides.append(self.strides[a])

        a = array(
            ptr=self.ptr,
            dtype=self.dtype,
            shape=tuple(shape),
            strides=tuple(strides),
            device=self.device,
            pinned=self.pinned,
            copy=False,
            owner=False,
            grad=None if self.grad is None else self.grad.transpose(axes=axes),
        )

        a.is_transposed = not self.is_transposed

        a._ref = self
        return a


# aliases for arrays with small dimensions
def array1d(*args, **kwargs):
    kwargs["ndim"] = 1
    return array(*args, **kwargs)


# equivalent to calling array(..., ndim=2)
def array2d(*args, **kwargs):
    kwargs["ndim"] = 2
    return array(*args, **kwargs)


# equivalent to calling array(..., ndim=3)
def array3d(*args, **kwargs):
    kwargs["ndim"] = 3
    return array(*args, **kwargs)


# equivalent to calling array(..., ndim=4)
def array4d(*args, **kwargs):
    kwargs["ndim"] = 4
    return array(*args, **kwargs)


# TODO: Rewrite so that we take only shape, not length and optional shape
def from_ptr(ptr, length, dtype=None, shape=None, device=None):
    return array(
        dtype=dtype,
        length=length,
        capacity=length * type_size_in_bytes(dtype),
        ptr=0 if ptr == 0 else ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value,
        shape=shape,
        device=device,
        owner=False,
        requires_grad=False,
    )


# A base class for non-contiguous arrays, providing the implementation of common methods like
# contiguous(), to(), numpy(), list(), assign(), zero_(), and fill_().
class noncontiguous_array_base(Generic[T]):
    def __init__(self, array_type_id):
        self.type_id = array_type_id
        self.is_contiguous = False

    # return a contiguous copy
    def contiguous(self):
        a = warp.empty_like(self)
        warp.copy(a, self)
        return a

    # copy data from one device to another, nop if already on device
    def to(self, device):
        device = warp.get_device(device)
        if self.device == device:
            return self
        else:
            return warp.clone(self, device=device)

    # return a contiguous numpy copy
    def numpy(self):
        # use the CUDA default stream for synchronous behaviour with other streams
        with warp.ScopedStream(self.device.null_stream):
            return self.contiguous().numpy()

    # returns a flattened list of items in the array as a Python list
    def list(self):
        # use the CUDA default stream for synchronous behaviour with other streams
        with warp.ScopedStream(self.device.null_stream):
            return self.contiguous().list()

    # equivalent to wrapping src data in an array and copying to self
    def assign(self, src):
        if is_array(src):
            warp.copy(self, src)
        else:
            warp.copy(self, array(data=src, dtype=self.dtype, copy=False, device="cpu"))

    def zero_(self):
        self.fill_(0)

    def fill_(self, value):
        if self.size == 0:
            return

        # try to convert the given value to the array dtype
        try:
            if isinstance(self.dtype, warp.codegen.Struct):
                if isinstance(value, self.dtype.cls):
                    cvalue = value.__ctype__()
                elif value == 0:
                    # allow zero-initializing structs using default constructor
                    cvalue = self.dtype().__ctype__()
                else:
                    raise ValueError(
                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
                    )
            elif issubclass(self.dtype, ctypes.Array):
                # vector/matrix
                cvalue = self.dtype(value)
            else:
                # scalar
                if type(value) in warp.types.scalar_types:
                    value = value.value
                if self.dtype == float16:
                    cvalue = self.dtype._type_(float_to_half_bits(value))
                else:
                    cvalue = self.dtype._type_(value)
        except Exception as e:
            raise ValueError(f"Failed to convert the value to the array data type: {e}")

        cvalue_ptr = ctypes.pointer(cvalue)
        cvalue_size = ctypes.sizeof(cvalue)

        ctype = self.__ctype__()
        ctype_ptr = ctypes.pointer(ctype)

        if self.device.is_cuda:
            warp.context.runtime.core.array_fill_device(
                self.device.context, ctype_ptr, self.type_id, cvalue_ptr, cvalue_size
            )
        else:
            warp.context.runtime.core.array_fill_host(ctype_ptr, self.type_id, cvalue_ptr, cvalue_size)


# helper to check index array properties
def check_index_array(indices, expected_device):
    if not isinstance(indices, array):
        raise ValueError(f"Indices must be a Warp array, got {type(indices)}")
    if indices.ndim != 1:
        raise ValueError(f"Index array must be one-dimensional, got {indices.ndim}")
    if indices.dtype != int32:
        raise ValueError(f"Index array must use int32, got dtype {indices.dtype}")
    if indices.device != expected_device:
        raise ValueError(f"Index array device ({indices.device} does not match data array device ({expected_device}))")


class indexedarray(noncontiguous_array_base[T]):
    # member attributes available during code-gen (e.g.: d = arr.shape[0])
    # (initialized when needed)
    _vars = None

    def __init__(self, data: array = None, indices: Union[array, List[array]] = None, dtype=None, ndim=None):
        super().__init__(ARRAY_TYPE_INDEXED)

        # canonicalize types
        if dtype is not None:
            if dtype == int:
                dtype = int32
            elif dtype == float:
                dtype = float32

        self.data = data
        self.indices = [None] * ARRAY_MAX_DIMS

        if data is not None:
            if not isinstance(data, array):
                raise ValueError("Indexed array data must be a Warp array")
            if dtype is not None and dtype != data.dtype:
                raise ValueError(f"Requested dtype ({dtype}) does not match dtype of data array ({data.dtype})")
            if ndim is not None and ndim != data.ndim:
                raise ValueError(
                    f"Requested dimensionality ({ndim}) does not match dimensionality of data array ({data.ndim})"
                )

            self.dtype = data.dtype
            self.ndim = data.ndim
            self.device = data.device
            self.pinned = data.pinned

            # determine shape from original data shape and index counts
            shape = list(data.shape)

            if indices is not None:
                if isinstance(indices, (list, tuple)):
                    if len(indices) > self.ndim:
                        raise ValueError(
                            f"Number of indices provided ({len(indices)}) exceeds number of dimensions ({self.ndim})"
                        )

                    for i in range(len(indices)):
                        if indices[i] is not None:
                            check_index_array(indices[i], data.device)
                            self.indices[i] = indices[i]
                            shape[i] = len(indices[i])

                elif isinstance(indices, array):
                    # only a single index array was provided
                    check_index_array(indices, data.device)
                    self.indices[0] = indices
                    shape[0] = len(indices)

                else:
                    raise ValueError("Indices must be a single Warp array or a list of Warp arrays")

            self.shape = tuple(shape)

        else:
            # allow empty indexedarrays in type annotations
            self.dtype = dtype
            self.ndim = ndim or 1
            self.device = None
            self.pinned = False
            self.shape = (0,) * self.ndim

        # update size (num elements)
        self.size = 1
        for d in self.shape:
            self.size *= d

    def __len__(self):
        return self.shape[0]

    def __str__(self):
        if self.device is None:
            # type annotation
            return f"indexedarray{self.dtype}"
        else:
            return str(self.numpy())

    # construct a C-representation of the array for passing to kernels
    def __ctype__(self):
        return indexedarray_t(self.data, self.indices, self.shape)

    @property
    def vars(self):
        # member attributes available during code-gen (e.g.: d = arr.shape[0])
        # Note: we use a shared dict for all indexedarray instances
        if indexedarray._vars is None:
            indexedarray._vars = {"shape": warp.codegen.Var("shape", shape_t)}
        return indexedarray._vars


# aliases for indexedarrays with small dimensions
def indexedarray1d(*args, **kwargs):
    kwargs["ndim"] = 1
    return indexedarray(*args, **kwargs)


# equivalent to calling indexedarray(..., ndim=2)
def indexedarray2d(*args, **kwargs):
    kwargs["ndim"] = 2
    return indexedarray(*args, **kwargs)


# equivalent to calling indexedarray(..., ndim=3)
def indexedarray3d(*args, **kwargs):
    kwargs["ndim"] = 3
    return indexedarray(*args, **kwargs)


# equivalent to calling indexedarray(..., ndim=4)
def indexedarray4d(*args, **kwargs):
    kwargs["ndim"] = 4
    return indexedarray(*args, **kwargs)


from warp.fabric import fabricarray, indexedfabricarray  # noqa: E402

array_types = (array, indexedarray, fabricarray, indexedfabricarray)


def array_type_id(a):
    if isinstance(a, array):
        return ARRAY_TYPE_REGULAR
    elif isinstance(a, indexedarray):
        return ARRAY_TYPE_INDEXED
    elif isinstance(a, fabricarray):
        return ARRAY_TYPE_FABRIC
    elif isinstance(a, indexedfabricarray):
        return ARRAY_TYPE_FABRIC_INDEXED
    else:
        raise ValueError("Invalid array type")


class Bvh:
    def __init__(self, lowers, uppers):
        """Class representing a bounding volume hierarchy.

        Attributes:
            id: Unique identifier for this bvh object, can be passed to kernels.
            device: Device this object lives on, all buffers must live on the same device.

        Args:
            lowers (:class:`warp.array`): Array of lower bounds :class:`warp.vec3`
            uppers (:class:`warp.array`): Array of upper bounds :class:`warp.vec3`
        """

        if len(lowers) != len(uppers):
            raise RuntimeError("Bvh the same number of lower and upper bounds must be provided")

        if lowers.device != uppers.device:
            raise RuntimeError("Bvh lower and upper bounds must live on the same device")

        if lowers.dtype != vec3 or not lowers.is_contiguous:
            raise RuntimeError("Bvh lowers should be a contiguous array of type wp.vec3")

        if uppers.dtype != vec3 or not uppers.is_contiguous:
            raise RuntimeError("Bvh uppers should be a contiguous array of type wp.vec3")

        self.device = lowers.device
        self.lowers = lowers
        self.uppers = uppers

        def get_data(array):
            if array:
                return ctypes.c_void_p(array.ptr)
            else:
                return ctypes.c_void_p(0)

        from warp.context import runtime

        if self.device.is_cpu:
            self.id = runtime.core.bvh_create_host(get_data(lowers), get_data(uppers), int(len(lowers)))
        else:
            self.id = runtime.core.bvh_create_device(
                self.device.context, get_data(lowers), get_data(uppers), int(len(lowers))
            )

    def __del__(self):
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.bvh_destroy_host(self.id)
            else:
                # use CUDA context guard to avoid side effects during garbage collection
                with self.device.context_guard:
                    runtime.core.bvh_destroy_device(self.id)

        except Exception:
            pass

    def refit(self):
        """Refit the BVH. This should be called after users modify the `lowers` and `uppers` arrays."""

        from warp.context import runtime

        if self.device.is_cpu:
            runtime.core.bvh_refit_host(self.id)
        else:
            runtime.core.bvh_refit_device(self.id)
            runtime.verify_cuda_device(self.device)


class Mesh:
    from warp.codegen import Var

    vars = {
        "points": Var("points", array(dtype=vec3)),
        "velocities": Var("velocities", array(dtype=vec3)),
        "indices": Var("indices", array(dtype=int32)),
    }

    def __init__(self, points=None, indices=None, velocities=None, support_winding_number=False):
        """Class representing a triangle mesh.

        Attributes:
            id: Unique identifier for this mesh object, can be passed to kernels.
            device: Device this object lives on, all buffers must live on the same device.

        Args:
            points (:class:`warp.array`): Array of vertex positions of type :class:`warp.vec3`
            indices (:class:`warp.array`): Array of triangle indices of type :class:`warp.int32`, should be a 1d array with shape (num_tris, 3)
            velocities (:class:`warp.array`): Array of vertex velocities of type :class:`warp.vec3` (optional)
            support_winding_number (bool): If true the mesh will build additional datastructures to support `wp.mesh_query_point_sign_winding_number()` queries
        """

        if points.device != indices.device:
            raise RuntimeError("Mesh points and indices must live on the same device")

        if points.dtype != vec3 or not points.is_contiguous:
            raise RuntimeError("Mesh points should be a contiguous array of type wp.vec3")

        if velocities and (velocities.dtype != vec3 or not velocities.is_contiguous):
            raise RuntimeError("Mesh velocities should be a contiguous array of type wp.vec3")

        if indices.dtype != int32 or not indices.is_contiguous:
            raise RuntimeError("Mesh indices should be a contiguous array of type wp.int32")

        if indices.ndim > 1:
            raise RuntimeError("Mesh indices should be a flattened 1d array of indices")

        self.device = points.device
        self.points = points
        self.velocities = velocities
        self.indices = indices

        from warp.context import runtime

        if self.device.is_cpu:
            self.id = runtime.core.mesh_create_host(
                points.__ctype__(),
                velocities.__ctype__() if velocities else array().__ctype__(),
                indices.__ctype__(),
                int(len(points)),
                int(indices.size / 3),
                int(support_winding_number),
            )
        else:
            self.id = runtime.core.mesh_create_device(
                self.device.context,
                points.__ctype__(),
                velocities.__ctype__() if velocities else array().__ctype__(),
                indices.__ctype__(),
                int(len(points)),
                int(indices.size / 3),
                int(support_winding_number),
            )

    def __del__(self):
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.mesh_destroy_host(self.id)
            else:
                # use CUDA context guard to avoid side effects during garbage collection
                with self.device.context_guard:
                    runtime.core.mesh_destroy_device(self.id)
        except Exception:
            pass

    def refit(self):
        """Refit the BVH to points. This should be called after users modify the `points` data."""

        from warp.context import runtime

        if self.device.is_cpu:
            runtime.core.mesh_refit_host(self.id)
        else:
            runtime.core.mesh_refit_device(self.id)
            runtime.verify_cuda_device(self.device)


class Volume:
    #: Enum value to specify nearest-neighbor interpolation during sampling
    CLOSEST = constant(0)
    #: Enum value to specify trilinear interpolation during sampling
    LINEAR = constant(1)

    def __init__(self, data: array):
        """Class representing a sparse grid.

        Args:
            data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format
        """

        self.id = 0

        from warp.context import runtime

        self.context = runtime

        if data is None:
            return

        if data.device is None:
            raise RuntimeError("Invalid device")
        self.device = data.device

        if self.device.is_cpu:
            self.id = self.context.core.volume_create_host(ctypes.cast(data.ptr, ctypes.c_void_p), data.size)
        else:
            self.id = self.context.core.volume_create_device(
                self.device.context, ctypes.cast(data.ptr, ctypes.c_void_p), data.size
            )

        if self.id == 0:
            raise RuntimeError("Failed to create volume from input array")

    def __del__(self):
        if self.id == 0:
            return

        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.volume_destroy_host(self.id)
            else:
                # use CUDA context guard to avoid side effects during garbage collection
                with self.device.context_guard:
                    runtime.core.volume_destroy_device(self.id)

        except Exception:
            pass

    def array(self) -> array:
        """Returns the raw memory buffer of the Volume as an array"""
        buf = ctypes.c_void_p(0)
        size = ctypes.c_uint64(0)
        if self.device.is_cpu:
            self.context.core.volume_get_buffer_info_host(self.id, ctypes.byref(buf), ctypes.byref(size))
        else:
            self.context.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
        return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)

    def get_tiles(self) -> array:
        if self.id == 0:
            raise RuntimeError("Invalid Volume")

        buf = ctypes.c_void_p(0)
        size = ctypes.c_uint64(0)
        if self.device.is_cpu:
            self.context.core.volume_get_tiles_host(self.id, ctypes.byref(buf), ctypes.byref(size))
        else:
            self.context.core.volume_get_tiles_device(self.id, ctypes.byref(buf), ctypes.byref(size))
        num_tiles = size.value // (3 * 4)
        return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, owner=True)

    def get_voxel_size(self) -> Tuple[float, float, float]:
        if self.id == 0:
            raise RuntimeError("Invalid Volume")

        dx, dy, dz = ctypes.c_float(0), ctypes.c_float(0), ctypes.c_float(0)
        self.context.core.volume_get_voxel_size(self.id, ctypes.byref(dx), ctypes.byref(dy), ctypes.byref(dz))
        return (dx.value, dy.value, dz.value)

    @classmethod
    def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume:
        """Creates a Volume object from a NanoVDB file or in-memory buffer.

        Returns:

            A ``warp.Volume`` object.
        """
        try:
            data = file_or_buffer.read()
        except AttributeError:
            data = file_or_buffer

        magic, version, grid_count, codec = struct.unpack("<QIHH", data[0:16])
        if magic != 0x304244566F6E614E:
            raise RuntimeError("NanoVDB signature not found")
        if version >> 21 != 32:  # checking major version
            raise RuntimeError("Unsupported NanoVDB version")
        if grid_count != 1:
            raise RuntimeError("Only NVDBs with exactly one grid are supported")

        grid_data_offset = 192 + struct.unpack("<I", data[152:156])[0]
        if codec == 0:  # no compression
            grid_data = data[grid_data_offset:]
        elif codec == 1:  # zip compression
            grid_data = zlib.decompress(data[grid_data_offset + 8 :])
        else:
            raise RuntimeError(f"Unsupported codec code: {codec}")

        magic = struct.unpack("<Q", grid_data[0:8])[0]
        if magic != 0x304244566F6E614E:
            raise RuntimeError("NanoVDB signature not found on grid!")

        data_array = array(np.frombuffer(grid_data, dtype=np.byte), device=device)
        return cls(data_array)

    @classmethod
    def load_from_numpy(
        cls, ndarray: np.array, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None
    ) -> Volume:
        """Creates a Volume object from a dense 3D NumPy array.

        This function is only supported for CUDA devices.

        Args:
            min_world: The 3D coordinate of the lower corner of the volume.
            voxel_size: The size of each voxel in spatial coordinates.
            bg_value: Background value
            device: The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".

        Returns:

            A ``warp.Volume`` object.
        """

        import math

        target_shape = (
            math.ceil(ndarray.shape[0] / 8) * 8,
            math.ceil(ndarray.shape[1] / 8) * 8,
            math.ceil(ndarray.shape[2] / 8) * 8,
        )
        if hasattr(bg_value, "__len__"):
            # vec3, assuming the numpy array is 4D
            padded_array = np.array((target_shape[0], target_shape[1], target_shape[2], 3), dtype=np.single)
            padded_array[:, :, :, :] = np.array(bg_value)
            padded_array[0 : ndarray.shape[0], 0 : ndarray.shape[1], 0 : ndarray.shape[2], :] = ndarray
        else:
            padded_amount = (
                math.ceil(ndarray.shape[0] / 8) * 8 - ndarray.shape[0],
                math.ceil(ndarray.shape[1] / 8) * 8 - ndarray.shape[1],
                math.ceil(ndarray.shape[2] / 8) * 8 - ndarray.shape[2],
            )
            padded_array = np.pad(
                ndarray,
                ((0, padded_amount[0]), (0, padded_amount[1]), (0, padded_amount[2])),
                mode="constant",
                constant_values=bg_value,
            )

        shape = padded_array.shape
        volume = warp.Volume.allocate(
            min_world,
            [
                min_world[0] + (shape[0] - 1) * voxel_size,
                min_world[1] + (shape[1] - 1) * voxel_size,
                min_world[2] + (shape[2] - 1) * voxel_size,
            ],
            voxel_size,
            bg_value=bg_value,
            points_in_world_space=True,
            translation=min_world,
            device=device,
        )

        # Populate volume
        if hasattr(bg_value, "__len__"):
            warp.launch(
                warp.utils.copy_dense_volume_to_nano_vdb_v,
                dim=(shape[0], shape[1], shape[2]),
                inputs=[volume.id, warp.array(padded_array, dtype=warp.vec3, device=device)],
                device=device,
            )
        elif isinstance(bg_value, int):
            warp.launch(
                warp.utils.copy_dense_volume_to_nano_vdb_i,
                dim=shape,
                inputs=[volume.id, warp.array(padded_array, dtype=warp.int32, device=device)],
                device=device,
            )
        else:
            warp.launch(
                warp.utils.copy_dense_volume_to_nano_vdb_f,
                dim=shape,
                inputs=[volume.id, warp.array(padded_array, dtype=warp.float32, device=device)],
                device=device,
            )

        return volume

    @classmethod
    def allocate(
        cls,
        min: List[int],
        max: List[int],
        voxel_size: float,
        bg_value=0.0,
        translation=(0.0, 0.0, 0.0),
        points_in_world_space=False,
        device=None,
    ) -> Volume:
        """Allocate a new Volume based on the bounding box defined by min and max.

        This function is only supported for CUDA devices.

        Allocate a volume that is large enough to contain voxels [min[0], min[1], min[2]] - [max[0], max[1], max[2]], inclusive.
        If points_in_world_space is true, then min and max are first converted to index space with the given voxel size and
        translation, and the volume is allocated with those.

        The smallest unit of allocation is a dense tile of 8x8x8 voxels, the requested bounding box is rounded up to tiles, and
        the resulting tiles will be available in the new volume.

        Args:
            min (array-like): Lower 3D coordinates of the bounding box in index space or world space, inclusive.
            max (array-like): Upper 3D coordinates of the bounding box in index space or world space, inclusive.
            voxel_size (float): Voxel size of the new volume.
            bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
            translation (array-like): translation between the index and world spaces.
            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".

        """
        if points_in_world_space:
            min = np.around((np.array(min, dtype=np.float32) - translation) / voxel_size)
            max = np.around((np.array(max, dtype=np.float32) - translation) / voxel_size)

        tile_min = np.array(min, dtype=np.int32) // 8
        tile_max = np.array(max, dtype=np.int32) // 8
        tiles = np.array(
            [
                [i, j, k]
                for i in range(tile_min[0], tile_max[0] + 1)
                for j in range(tile_min[1], tile_max[1] + 1)
                for k in range(tile_min[2], tile_max[2] + 1)
            ],
            dtype=np.int32,
        )
        tile_points = array(tiles * 8, device=device)

        return cls.allocate_by_tiles(tile_points, voxel_size, bg_value, translation, device)

    @classmethod
    def allocate_by_tiles(
        cls, tile_points: array, voxel_size: float, bg_value=0.0, translation=(0.0, 0.0, 0.0), device=None
    ) -> Volume:
        """Allocate a new Volume with active tiles for each point tile_points.

        This function is only supported for CUDA devices.

        The smallest unit of allocation is a dense tile of 8x8x8 voxels.
        This is the primary method for allocating sparse volumes. It uses an array of points indicating the tiles that must be allocated.

        Example use cases:
            * `tile_points` can mark tiles directly in index space as in the case this method is called by `allocate`.
            * `tile_points` can be a list of points used in a simulation that needs to transfer data to a volume.

        Args:
            tile_points (:class:`warp.array`): Array of positions that define the tiles to be allocated.
                The array can be a 2D, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
                or can be a 1D array of :class:`warp.vec3` values, indicating world space positions.
                Repeated points per tile are allowed and will be efficiently deduplicated.
            voxel_size (float): Voxel size of the new volume.
            bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
            translation (array-like): Translation between the index and world spaces.
            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".

        """
        from warp.context import runtime

        device = runtime.get_device(device)

        if voxel_size <= 0.0:
            raise RuntimeError(f"Voxel size must be positive! Got {voxel_size}")
        if not device.is_cuda:
            raise RuntimeError("Only CUDA devices are supported for allocate_by_tiles")
        if not (
            isinstance(tile_points, array)
            and (tile_points.dtype == int32 and tile_points.ndim == 2)
            or (tile_points.dtype == vec3 and tile_points.ndim == 1)
        ):
            raise RuntimeError("Expected an warp array of vec3s or of n-by-3 int32s as tile_points!")
        if not tile_points.device.is_cuda:
            tile_points = array(tile_points, dtype=tile_points.dtype, device=device)

        volume = cls(data=None)
        volume.device = device
        in_world_space = tile_points.dtype == vec3
        if hasattr(bg_value, "__len__"):
            volume.id = volume.context.core.volume_v_from_tiles_device(
                volume.device.context,
                ctypes.c_void_p(tile_points.ptr),
                tile_points.shape[0],
                voxel_size,
                bg_value[0],
                bg_value[1],
                bg_value[2],
                translation[0],
                translation[1],
                translation[2],
                in_world_space,
            )
        elif isinstance(bg_value, int):
            volume.id = volume.context.core.volume_i_from_tiles_device(
                volume.device.context,
                ctypes.c_void_p(tile_points.ptr),
                tile_points.shape[0],
                voxel_size,
                bg_value,
                translation[0],
                translation[1],
                translation[2],
                in_world_space,
            )
        else:
            volume.id = volume.context.core.volume_f_from_tiles_device(
                volume.device.context,
                ctypes.c_void_p(tile_points.ptr),
                tile_points.shape[0],
                voxel_size,
                float(bg_value),
                translation[0],
                translation[1],
                translation[2],
                in_world_space,
            )

        if volume.id == 0:
            raise RuntimeError("Failed to create volume")

        return volume


# definition just for kernel type (cannot be a parameter), see mesh.h
# NOTE: its layout must match the corresponding struct defined in C.
# NOTE: it needs to be defined after `indexedarray` to workaround a circular import issue.
class mesh_query_point_t:
    """Output for the mesh query point functions.

    Attributes:
        result (bool): Whether a point is found within the given constraints.
        sign (float32): A value < 0 if query point is inside the mesh, >=0 otherwise.
                        Note that mesh must be watertight for this to be robust
        face (int32): Index of the closest face.
        u (float32): Barycentric u coordinate of the closest point.
        v (float32): Barycentric v coordinate of the closest point.

    See Also:
        :func:`mesh_query_point`, :func:`mesh_query_point_no_sign`,
        :func:`mesh_query_furthest_point_no_sign`,
        :func:`mesh_query_point_sign_normal`,
        and :func:`mesh_query_point_sign_winding_number`.
    """
    from warp.codegen import Var

    vars = {
        "result": Var("result", bool),
        "sign": Var("sign", float32),
        "face": Var("face", int32),
        "u": Var("u", float32),
        "v": Var("v", float32),
    }


# definition just for kernel type (cannot be a parameter), see mesh.h
# NOTE: its layout must match the corresponding struct defined in C.
class mesh_query_ray_t:
    """Output for the mesh query ray functions.

    Attributes:
        result (bool): Whether a hit is found within the given constraints.
        sign (float32): A value > 0 if the ray hit in front of the face, returns < 0 otherwise.
        face (int32): Index of the closest face.
        t (float32): Distance of the closest hit along the ray.
        u (float32): Barycentric u coordinate of the closest hit.
        v (float32): Barycentric v coordinate of the closest hit.
        normal (vec3f): Face normal.

    See Also:
        :func:`mesh_query_ray`.
    """
    from warp.codegen import Var

    vars = {
        "result": Var("result", bool),
        "sign": Var("sign", float32),
        "face": Var("face", int32),
        "t": Var("t", float32),
        "u": Var("u", float32),
        "v": Var("v", float32),
        "normal": Var("normal", vec3),
    }


def matmul(
    a: array2d,
    b: array2d,
    c: array2d,
    d: array2d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

    Args:
        a (array2d): two-dimensional array containing matrix A
        b (array2d): two-dimensional array containing matrix B
        c (array2d): two-dimensional array containing matrix C
        d (array2d): two-dimensional array to which output D is written
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
                                   while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
    """
    from warp.context import runtime

    if device is None:
        device = runtime.get_device(device)

    if a.device != device or b.device != device or c.device != device or d.device != device:
        raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")

    if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
        raise RuntimeError(
            "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
        )

    if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
        )

    m = a.shape[0]
    n = b.shape[1]
    k = a.shape[1]
    if b.shape != (k, n) or c.shape != (m, n) or d.shape != (m, n):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} D = {}".format(a.shape, b.shape, c.shape, d.shape)
        )

    if runtime.tape:
        runtime.tape.record_func(
            backward=lambda: adj_matmul(
                a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
            ),
            arrays=[a, b, c, d],
        )

    # cpu fallback if no cuda devices found
    if device == "cpu":
        d.assign(alpha * (a.numpy() @ b.numpy()) + beta * c.numpy())
        return

    cc = device.arch
    ret = runtime.core.cutlass_gemm(
        cc,
        m,
        n,
        k,
        type_typestr(a.dtype).encode(),
        ctypes.c_void_p(a.ptr),
        ctypes.c_void_p(b.ptr),
        ctypes.c_void_p(c.ptr),
        ctypes.c_void_p(d.ptr),
        alpha,
        beta,
        not a.is_transposed,
        not b.is_transposed,
        allow_tf32x3_arith,
        1,
    )
    if not ret:
        raise RuntimeError("matmul failed.")


def adj_matmul(
    a: array2d,
    b: array2d,
    c: array2d,
    adj_a: array2d,
    adj_b: array2d,
    adj_c: array2d,
    adj_d: array2d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes the adjoint of a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
        note: the adjoint of parameter alpha is not included but can be computed as `adj_alpha = np.sum(np.concatenate(np.multiply(a @ b, adj_d)))`.
        note: the adjoint of parameter beta is not included but can be computed as `adj_beta = np.sum(np.concatenate(np.multiply(c, adj_d)))`.

    Args:
        a (array2d): two-dimensional array containing matrix A
        b (array2d): two-dimensional array containing matrix B
        c (array2d): two-dimensional array containing matrix C
        adj_a (array2d): two-dimensional array to which the adjoint of matrix A is written
        adj_b (array2d): two-dimensional array to which the adjoint of matrix B is written
        adj_c (array2d): two-dimensional array to which the adjoint of matrix C is written
        adj_d (array2d): two-dimensional array containing the adjoint of matrix D
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
                                   while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
    """
    from warp.context import runtime

    if device is None:
        device = runtime.get_device(device)

    if (
        a.device != device
        or b.device != device
        or c.device != device
        or adj_a.device != device
        or adj_b.device != device
        or adj_c.device != device
        or adj_d.device != device
    ):
        raise RuntimeError(
            "Matrices A, B, C, D, and their adjoints must all be on the same device as the runtime device."
        )

    if (
        a.dtype != b.dtype
        or a.dtype != c.dtype
        or a.dtype != adj_a.dtype
        or a.dtype != adj_b.dtype
        or a.dtype != adj_c.dtype
        or a.dtype != adj_d.dtype
    ):
        raise RuntimeError(
            "wp.adj_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
        )

    if (
        (not a.is_contiguous and not a.is_transposed)
        or (not b.is_contiguous and not b.is_transposed)
        or (not c.is_contiguous)
        or (not adj_a.is_contiguous and not adj_a.is_transposed)
        or (not adj_b.is_contiguous and not adj_b.is_transposed)
        or (not adj_c.is_contiguous)
        or (not adj_d.is_contiguous)
    ):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
        )
    
    m = a.shape[0]
    n = b.shape[1]
    k = a.shape[1]
    if (
        a.shape != (m, k)
        or b.shape != (k, n)
        or c.shape != (m, n)
        or adj_d.shape != (m, n)
        or adj_a.shape != (m, k)
        or adj_b.shape != (k, n)
        or adj_c.shape != (m, n)
    ):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} adj_D = {} adj_A = {} adj_B = {} adj_C = {}".format(
                a.shape, b.shape, c.shape, adj_d.shape, adj_a.shape, adj_b.shape, adj_c.shape
            )
        )

    # cpu fallback if no cuda devices found
    if device == "cpu":
        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
        return

    cc = device.arch

    # adj_a
    if not a.is_transposed:
        ret = runtime.core.cutlass_gemm(
            cc,
            m,
            k,
            n,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(adj_d.ptr),
            ctypes.c_void_p(b.ptr),
            ctypes.c_void_p(adj_a.ptr),
            ctypes.c_void_p(adj_a.ptr),
            alpha,
            1.0,
            True,
            b.is_transposed,
            allow_tf32x3_arith,
            1,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")
    else:
        ret = runtime.core.cutlass_gemm(
            cc,
            k,
            m,
            n,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(b.ptr),
            ctypes.c_void_p(adj_d.ptr),
            ctypes.c_void_p(adj_a.ptr),
            ctypes.c_void_p(adj_a.ptr),
            alpha,
            1.0,
            not b.is_transposed,
            False,
            allow_tf32x3_arith,
            1,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")

    # adj_b
    if not b.is_transposed:
        ret = runtime.core.cutlass_gemm(
            cc,
            k,
            n,
            m,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(a.ptr),
            ctypes.c_void_p(adj_d.ptr),
            ctypes.c_void_p(adj_b.ptr),
            ctypes.c_void_p(adj_b.ptr),
            alpha,
            1.0,
            a.is_transposed,
            True,
            allow_tf32x3_arith,
            1,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")
    else:
        ret = runtime.core.cutlass_gemm(
            cc,
            n,
            k,
            m,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(adj_d.ptr),
            ctypes.c_void_p(a.ptr),
            ctypes.c_void_p(adj_b.ptr),
            ctypes.c_void_p(adj_b.ptr),
            alpha,
            1.0,
            False,
            not a.is_transposed,
            allow_tf32x3_arith,
            1,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")        

    # adj_c
    warp.launch(
        kernel=warp.utils.add_kernel_2d,
        dim=adj_c.shape,
        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
        device=device,
        record_tape=False
    )


def batched_matmul(
    a: array3d,
    b: array3d,
    c: array3d,
    d: array3d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

    Args:
        a (array3d): three-dimensional array containing A matrices. Overall array dimension is {batch_count, M, K}
        b (array3d): three-dimensional array containing B matrices. Overall array dimension is {batch_count, K, N}
        c (array3d): three-dimensional array containing C matrices. Overall array dimension is {batch_count, M, N}
        d (array3d): three-dimensional array to which output D is written. Overall array dimension is {batch_count, M, N}
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
                                   while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
    """
    from warp.context import runtime

    if device is None:
        device = runtime.get_device(device)

    if a.device != device or b.device != device or c.device != device or d.device != device:
        raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")

    if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
        raise RuntimeError(
            "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
        )

    if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
        )

    m = a.shape[1]
    n = b.shape[2]
    k = a.shape[2]
    batch_count = a.shape[0]
    if b.shape != (batch_count, k, n) or c.shape != (batch_count, m, n) or d.shape != (batch_count, m, n):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} D = {}".format(a.shape, b.shape, c.shape, d.shape)
        )

    if runtime.tape:
        runtime.tape.record_func(
            backward=lambda: adj_batched_matmul(
                a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
            ),
            arrays=[a, b, c, d],
        )

    # cpu fallback if no cuda devices found
    if device == "cpu":
        d.assign(alpha * np.matmul(a.numpy(), b.numpy()) + beta * c.numpy())
        return

    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
    max_batch_count = 65535
    iters = int(batch_count / max_batch_count)
    remainder = batch_count % max_batch_count

    cc = device.arch
    for i in range(iters):
        idx_start = i * max_batch_count
        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
        ret = runtime.core.cutlass_gemm(
            cc,
            m,
            n,
            k,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
            ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
            ctypes.c_void_p(c[idx_start:idx_end,:,:].ptr),
            ctypes.c_void_p(d[idx_start:idx_end,:,:].ptr),
            alpha,
            beta,
            not a.is_transposed,
            not b.is_transposed,
            allow_tf32x3_arith,
            max_batch_count,
        )
        if not ret:
            raise RuntimeError("Batched matmul failed.")
    
    idx_start = iters * max_batch_count
    ret = runtime.core.cutlass_gemm(
        cc,
        m,
        n,
        k,
        type_typestr(a.dtype).encode(),
        ctypes.c_void_p(a[idx_start:,:,:].ptr),
        ctypes.c_void_p(b[idx_start:,:,:].ptr),
        ctypes.c_void_p(c[idx_start:,:,:].ptr),
        ctypes.c_void_p(d[idx_start:,:,:].ptr),
        alpha,
        beta,
        not a.is_transposed,
        not b.is_transposed,
        allow_tf32x3_arith,
        remainder,
    )
    if not ret:
        raise RuntimeError("Batched matmul failed.")    


def adj_batched_matmul(
    a: array3d,
    b: array3d,
    c: array3d,
    adj_a: array3d,
    adj_b: array3d,
    adj_c: array3d,
    adj_d: array3d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

    Args:
        a (array3d): three-dimensional array containing A matrices. Overall array dimension is {batch_count, M, K}
        b (array3d): three-dimensional array containing B matrices. Overall array dimension is {batch_count, K, N}
        c (array3d): three-dimensional array containing C matrices. Overall array dimension is {batch_count, M, N}
        adj_a (array3d): three-dimensional array to which the adjoints of A matrices are written. Overall array dimension is {batch_count, M, K}
        adj_b (array3d): three-dimensional array to which the adjoints of B matrices are written. Overall array dimension is {batch_count, K, N}
        adj_c (array3d): three-dimensional array to which the adjoints of C matrices are written. Overall array dimension is {batch_count, M, N}
        adj_d (array3d): three-dimensional array containing adjoints of D matrices. Overall array dimension is {batch_count, M, N}
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
                                   while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
    """
    from warp.context import runtime

    if device is None:
        device = runtime.get_device(device)

    if (
        a.device != device
        or b.device != device
        or c.device != device
        or adj_a.device != device
        or adj_b.device != device
        or adj_c.device != device
        or adj_d.device != device
    ):
        raise RuntimeError(
            "Matrices A, B, C, D, and their adjoints must all be on the same device as the runtime device."
        )

    if (
        a.dtype != b.dtype
        or a.dtype != c.dtype
        or a.dtype != adj_a.dtype
        or a.dtype != adj_b.dtype
        or a.dtype != adj_c.dtype
        or a.dtype != adj_d.dtype
    ):
        raise RuntimeError(
            "wp.adj_batched_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
        )

    m = a.shape[1]
    n = b.shape[2]
    k = a.shape[2]
    batch_count = a.shape[0]
    if (
        b.shape != (batch_count, k, n)
        or c.shape != (batch_count, m, n)
        or adj_d.shape != (batch_count, m, n)
        or adj_a.shape != (batch_count, m, k)
        or adj_b.shape != (batch_count, k, n)
        or adj_c.shape != (batch_count, m, n)
    ):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} adj_D = {} adj_A = {} adj_B = {} adj_C = {}".format(
                a.shape, b.shape, c.shape, adj_d.shape, adj_a.shape, adj_b.shape, adj_c.shape
            )
        )

    if (
        (not a.is_contiguous and not a.is_transposed)
        or (not b.is_contiguous and not b.is_transposed)
        or (not c.is_contiguous)
        or (not adj_a.is_contiguous and not adj_a.is_transposed)
        or (not adj_b.is_contiguous and not adj_b.is_transposed)
        or (not adj_c.is_contiguous)
        or (not adj_d.is_contiguous)
    ):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
        )

    # cpu fallback if no cuda devices found
    if device == "cpu":
        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
        return

    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
    max_batch_count = 65535
    iters = int(batch_count / max_batch_count)
    remainder = batch_count % max_batch_count

    cc = device.arch

    for i in range(iters):
        idx_start = i * max_batch_count
        idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count

        # adj_a
        if not a.is_transposed:
            ret = runtime.core.cutlass_gemm(
                cc,
                m,
                k,
                n,
                type_typestr(a.dtype).encode(),
                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
                alpha,
                1.0,
                True,
                b.is_transposed,
                allow_tf32x3_arith,
                max_batch_count,
            )
            if not ret:
                raise RuntimeError("adj_matmul failed.")
        else:
            ret = runtime.core.cutlass_gemm(
                cc,
                k,
                m,
                n,
                type_typestr(a.dtype).encode(),
                ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
                alpha,
                1.0,
                not b.is_transposed,
                False,
                allow_tf32x3_arith,
                max_batch_count,
            )
            if not ret:
                raise RuntimeError("adj_matmul failed.")

        # adj_b
        if not b.is_transposed:
            ret = runtime.core.cutlass_gemm(
                cc,
                k,
                n,
                m,
                type_typestr(a.dtype).encode(),
                ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
                alpha,
                1.0,
                a.is_transposed,
                True,
                allow_tf32x3_arith,
                max_batch_count,
            )
            if not ret:
                raise RuntimeError("adj_matmul failed.")
        else:
            ret = runtime.core.cutlass_gemm(
                cc,
                n,
                k,
                m,
                type_typestr(a.dtype).encode(),
                ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
                ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
                alpha,
                1.0,
                False,
                not a.is_transposed,
                allow_tf32x3_arith,
                max_batch_count,
            )
            if not ret:
                raise RuntimeError("adj_matmul failed.")   
    
    idx_start = iters * max_batch_count
    
    # adj_a
    if not a.is_transposed:
        ret = runtime.core.cutlass_gemm(
            cc,
            m,
            k,
            n,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
            ctypes.c_void_p(b[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
            alpha,
            1.0,
            True,
            b.is_transposed,
            allow_tf32x3_arith,
            remainder,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")
    else:
        ret = runtime.core.cutlass_gemm(
            cc,
            k,
            m,
            n,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(b[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
            alpha,
            1.0,
            not b.is_transposed,
            False,
            allow_tf32x3_arith,
            remainder,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")

    # adj_b
    if not b.is_transposed:
        ret = runtime.core.cutlass_gemm(
            cc,
            k,
            n,
            m,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(a[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
            alpha,
            1.0,
            a.is_transposed,
            True,
            allow_tf32x3_arith,
            remainder,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")
    else:
        ret = runtime.core.cutlass_gemm(
            cc,
            n,
            k,
            m,
            type_typestr(a.dtype).encode(),
            ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
            ctypes.c_void_p(a[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
            ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
            alpha,
            1.0,
            False,
            not a.is_transposed,
            allow_tf32x3_arith,
            remainder,
        )
        if not ret:
            raise RuntimeError("adj_matmul failed.")   

    # adj_c
    warp.launch(
        kernel=warp.utils.add_kernel_3d,
        dim=adj_c.shape,
        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
        device=device,
        record_tape=False
    )

class HashGrid:
    def __init__(self, dim_x, dim_y, dim_z, device=None):
        """Class representing a hash grid object for accelerated point queries.

        Attributes:
            id: Unique identifier for this mesh object, can be passed to kernels.
            device: Device this object lives on, all buffers must live on the same device.

        Args:
            dim_x (int): Number of cells in x-axis
            dim_y (int): Number of cells in y-axis
            dim_z (int): Number of cells in z-axis
        """

        from warp.context import runtime

        self.device = runtime.get_device(device)

        if self.device.is_cpu:
            self.id = runtime.core.hash_grid_create_host(dim_x, dim_y, dim_z)
        else:
            self.id = runtime.core.hash_grid_create_device(self.device.context, dim_x, dim_y, dim_z)

        # indicates whether the grid data has been reserved for use by a kernel
        self.reserved = False

    def build(self, points, radius):
        """Updates the hash grid data structure.

        This method rebuilds the underlying datastructure and should be called any time the set
        of points changes.

        Args:
            points (:class:`warp.array`): Array of points of type :class:`warp.vec3`
            radius (float): The cell size to use for bucketing points, cells are cubes with edges of this width.
                            For best performance the radius used to construct the grid should match closely to
                            the radius used when performing queries.
        """

        from warp.context import runtime

        if self.device.is_cpu:
            runtime.core.hash_grid_update_host(self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points))
        else:
            runtime.core.hash_grid_update_device(self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points))
        self.reserved = True

    def reserve(self, num_points):
        from warp.context import runtime

        if self.device.is_cpu:
            runtime.core.hash_grid_reserve_host(self.id, num_points)
        else:
            runtime.core.hash_grid_reserve_device(self.id, num_points)
        self.reserved = True

    def __del__(self):
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.hash_grid_destroy_host(self.id)
            else:
                # use CUDA context guard to avoid side effects during garbage collection
                with self.device.context_guard:
                    runtime.core.hash_grid_destroy_device(self.id)

        except Exception:
            pass


class MarchingCubes:
    def __init__(self, nx: int, ny: int, nz: int, max_verts: int, max_tris: int, device=None):
        from warp.context import runtime

        self.device = runtime.get_device(device)

        if not self.device.is_cuda:
            raise RuntimeError("Only CUDA devices are supported for marching cubes")

        self.nx = nx
        self.ny = ny
        self.nz = nz

        self.max_verts = max_verts
        self.max_tris = max_tris

        # bindings to warp.so
        self.alloc = runtime.core.marching_cubes_create_device
        self.alloc.argtypes = [ctypes.c_void_p]
        self.alloc.restype = ctypes.c_uint64
        self.free = runtime.core.marching_cubes_destroy_device

        from warp.context import zeros

        self.verts = zeros(max_verts, dtype=vec3, device=self.device)
        self.indices = zeros(max_tris * 3, dtype=int, device=self.device)

        # alloc surfacer
        self.id = ctypes.c_uint64(self.alloc(self.device.context))

    def __del__(self):
        # use CUDA context guard to avoid side effects during garbage collection
        with self.device.context_guard:
            # destroy surfacer
            self.free(self.id)

    def resize(self, nx: int, ny: int, nz: int, max_verts: int, max_tris: int):
        # actual allocations will be resized on next call to surface()
        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.max_verts = max_verts
        self.max_tris = max_tris

    def surface(self, field: array(dtype=float), threshold: float):
        from warp.context import runtime

        # WP_API int marching_cubes_surface_host(const float* field, int nx, int ny, int nz, float threshold, wp::vec3* verts, int* triangles, int max_verts, int max_tris, int* out_num_verts, int* out_num_tris);
        num_verts = ctypes.c_int(0)
        num_tris = ctypes.c_int(0)

        runtime.core.marching_cubes_surface_device.restype = ctypes.c_int

        error = runtime.core.marching_cubes_surface_device(
            self.id,
            ctypes.cast(field.ptr, ctypes.c_void_p),
            self.nx,
            self.ny,
            self.nz,
            ctypes.c_float(threshold),
            ctypes.cast(self.verts.ptr, ctypes.c_void_p),
            ctypes.cast(self.indices.ptr, ctypes.c_void_p),
            self.max_verts,
            self.max_tris,
            ctypes.c_void_p(ctypes.addressof(num_verts)),
            ctypes.c_void_p(ctypes.addressof(num_tris)),
        )

        if error:
            raise RuntimeError(
                "Buffers may not be large enough, marching cubes required at least {num_verts} vertices, and {num_tris} triangles."
            )

        # resize the geometry arrays
        self.verts.shape = (num_verts.value,)
        self.indices.shape = (num_tris.value * 3,)

        self.verts.size = num_verts.value
        self.indices.size = num_tris.value * 3


def type_is_generic(t):
    if t in (Any, Scalar, Float, Int):
        return True
    elif is_array(t):
        return type_is_generic(t.dtype)
    elif hasattr(t, "_wp_scalar_type_"):
        # vector/matrix type, check if dtype is generic
        if type_is_generic(t._wp_scalar_type_):
            return True
        # check if any dimension is generic
        for d in t._shape_:
            if d == 0:
                return True
    else:
        return False


def type_is_generic_scalar(t):
    return t in (Scalar, Float, Int)


def type_matches_template(arg_type, template_type):
    """Check if an argument type matches a template.

    This function is used to test whether the arguments passed to a generic @wp.kernel or @wp.func
    match the template type annotations.  The template_type can be generic, but the arg_type must be concrete.
    """

    # canonicalize types
    arg_type = type_to_warp(arg_type)
    template_type = type_to_warp(template_type)

    # arg type must be concrete
    if type_is_generic(arg_type):
        return False

    # if template type is not generic, the argument type must match exactly
    if not type_is_generic(template_type):
        return types_equal(arg_type, template_type)

    # template type is generic, check that the argument type matches
    if template_type == Any:
        return True
    elif is_array(template_type):
        # ensure the argument type is a non-generic array with matching dtype and dimensionality
        if type(arg_type) is not type(template_type):
            return False
        if not type_matches_template(arg_type.dtype, template_type.dtype):
            return False
        if arg_type.ndim != template_type.ndim:
            return False
    elif template_type == Float:
        return arg_type in float_types
    elif template_type == Int:
        return arg_type in int_types
    elif template_type == Scalar:
        return arg_type in scalar_types
    elif hasattr(template_type, "_wp_scalar_type_"):
        # vector/matrix type
        if not hasattr(arg_type, "_wp_scalar_type_"):
            return False
        if not type_matches_template(arg_type._wp_scalar_type_, template_type._wp_scalar_type_):
            return False
        ndim = len(template_type._shape_)
        if len(arg_type._shape_) != ndim:
            return False
        # for any non-generic dimensions, make sure they match
        for i in range(ndim):
            if template_type._shape_[i] != 0 and arg_type._shape_[i] != template_type._shape_[i]:
                return False

    return True


def infer_argument_types(args, template_types, arg_names=None):
    """Resolve argument types with the given list of template types."""

    if len(args) != len(template_types):
        raise RuntimeError("Number of arguments must match number of template types.")

    arg_types = []

    for i in range(len(args)):
        arg = args[i]
        arg_type = type(arg)
        arg_name = arg_names[i] if arg_names else str(i)
        if arg_type in warp.types.array_types:
            arg_types.append(arg_type(dtype=arg.dtype, ndim=arg.ndim))
        elif arg_type in warp.types.scalar_types:
            arg_types.append(arg_type)
        elif arg_type in [int, float]:
            # canonicalize type
            arg_types.append(warp.types.type_to_warp(arg_type))
        elif hasattr(arg_type, "_wp_scalar_type_"):
            # vector/matrix type
            arg_types.append(arg_type)
        elif issubclass(arg_type, warp.codegen.StructInstance):
            # a struct
            arg_types.append(arg._cls)
        # elif arg_type in [warp.types.launch_bounds_t, warp.types.shape_t, warp.types.range_t]:
        #     arg_types.append(arg_type)
        # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.mesh_query_point_t, warp.mesh_query_ray_t, warp.bvh_query_t]:
        #     arg_types.append(arg_type)
        elif arg is None:
            # allow passing None for arrays
            t = template_types[i]
            if warp.types.is_array(t):
                arg_types.append(type(t)(dtype=t.dtype, ndim=t.ndim))
            else:
                raise TypeError(f"Unable to infer the type of argument '{arg_name}', got None")
        else:
            # TODO: attempt to figure out if it's a vector/matrix type given as a numpy array, list, etc.
            raise TypeError(f"Unable to infer the type of argument '{arg_name}', got {arg_type}")

    return arg_types


simple_type_codes = {
    int: "i4",
    float: "f4",
    builtins.bool: "b",
    bool: "b",
    str: "str",  # accepted by print()
    int8: "i1",
    int16: "i2",
    int32: "i4",
    int64: "i8",
    uint8: "u1",
    uint16: "u2",
    uint32: "u4",
    uint64: "u8",
    float16: "f2",
    float32: "f4",
    float64: "f8",
    shape_t: "sh",
    range_t: "rg",
    launch_bounds_t: "lb",
    hash_grid_query_t: "hgq",
    mesh_query_aabb_t: "mqa",
    mesh_query_point_t: "mqp",
    mesh_query_ray_t: "mqr",
    bvh_query_t: "bvhq",
}


def get_type_code(arg_type):
    if arg_type == Any:
        # special case for generics
        # note: since Python 3.11 Any is a type, so we check for it first
        return "?"
    elif isinstance(arg_type, type):
        if hasattr(arg_type, "_wp_scalar_type_"):
            # vector/matrix type
            dtype_code = get_type_code(arg_type._wp_scalar_type_)
            # check for "special" vector/matrix subtypes
            if hasattr(arg_type, "_wp_generic_type_str_"):
                type_str = arg_type._wp_generic_type_str_
                if type_str == "quat_t":
                    return f"q{dtype_code}"
                elif type_str == "transform_t":
                    return f"t{dtype_code}"
                # elif type_str == "spatial_vector_t":
                #     return f"sv{dtype_code}"
                # elif type_str == "spatial_matrix_t":
                #     return f"sm{dtype_code}"
            # generic vector/matrix
            ndim = len(arg_type._shape_)
            if ndim == 1:
                dim_code = "?" if arg_type._shape_[0] == 0 else str(arg_type._shape_[0])
                return f"v{dim_code}{dtype_code}"
            elif ndim == 2:
                dim_code0 = "?" if arg_type._shape_[0] == 0 else str(arg_type._shape_[0])
                dim_code1 = "?" if arg_type._shape_[1] == 0 else str(arg_type._shape_[1])
                return f"m{dim_code0}{dim_code1}{dtype_code}"
            else:
                raise TypeError("Invalid vector/matrix dimensionality")
        else:
            # simple type
            type_code = simple_type_codes.get(arg_type)
            if type_code is not None:
                return type_code
            else:
                raise TypeError(f"Unrecognized type '{arg_type}'")
    elif isinstance(arg_type, array):
        return f"a{arg_type.ndim}{get_type_code(arg_type.dtype)}"
    elif isinstance(arg_type, indexedarray):
        return f"ia{arg_type.ndim}{get_type_code(arg_type.dtype)}"
    elif isinstance(arg_type, fabricarray):
        return f"fa{arg_type.ndim}{get_type_code(arg_type.dtype)}"
    elif isinstance(arg_type, indexedfabricarray):
        return f"ifa{arg_type.ndim}{get_type_code(arg_type.dtype)}"
    elif isinstance(arg_type, warp.codegen.Struct):
        return warp.codegen.make_full_qualified_name(arg_type.cls)
    elif arg_type == Scalar:
        # generic scalar type
        return "s?"
    elif arg_type == Float:
        # generic float
        return "f?"
    elif arg_type == Int:
        # generic int
        return "i?"
    elif isinstance(arg_type, Callable):
        # TODO: elaborate on Callable type?
        return "c"
    else:
        raise TypeError(f"Unrecognized type '{arg_type}'")


def get_signature(arg_types, func_name=None, arg_names=None):
    type_codes = []
    for i, arg_type in enumerate(arg_types):
        try:
            type_codes.append(get_type_code(arg_type))
        except Exception as e:
            if arg_names is not None:
                arg_str = f"'{arg_names[i]}'"
            else:
                arg_str = str(i + 1)
            if func_name is not None:
                func_str = f" of function {func_name}"
            else:
                func_str = ""
            raise RuntimeError(f"Failed to determine type code for argument {arg_str}{func_str}: {e}")

    return "_".join(type_codes)


def is_generic_signature(sig):
    return "?" in sig