# Copyright (c) 2023 NVIDIA CORPORATION.  All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import warp
import ctypes

from warp.thirdparty.dlpack import (
    DLManagedTensor,
    DLDevice,
    DLDeviceType,
    DLDataType,
    DLDataTypeCode,
    DLTensor,
    _c_str_dltensor,
)

# ctypes treats foreign functions as returning a C int unless a restype is set,
# so the CPython entry points used below that traffic in pointers / objects
# get explicit prototypes here.

# raw allocator pair used for the DLManagedTensor struct
# (allocated in to_dlpack, released in _warp_array_deleter)
ctypes.pythonapi.PyMem_RawMalloc.restype = ctypes.c_void_p
ctypes.pythonapi.PyMem_RawFree.argtypes = [ctypes.c_void_p]

# PyCapsule_New(pointer, name, destructor) -> capsule object
ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
ctypes.pythonapi.PyCapsule_New.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]

# PyCapsule_IsValid(capsule, name) -> int (non-zero when the capsule matches the name)
ctypes.pythonapi.PyCapsule_IsValid.restype = ctypes.c_int
ctypes.pythonapi.PyCapsule_IsValid.argtypes = [ctypes.py_object, ctypes.c_char_p]

# PyCapsule_GetPointer(capsule, name) -> pointer stored in the capsule
ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]


class _Holder:
    """Owner object referenced from a ``DLManagedTensor.manager_ctx``.

    It holds a reference to the exported Warp array; ``to_dlpack`` additionally
    attaches the ctypes shape and stride arrays to the instance (``_shape`` /
    ``_strides``) so they are not deallocated while the tensor is in use.
    """

    def __init__(self, wp_array) -> None:
        # reference to the array whose memory is being exported
        self.wp_array = wp_array

    def _as_manager_ctx(self) -> ctypes.c_void_p:
        """Return a ``void*`` to a ``py_object`` wrapping this holder.

        Both the holder and the ctypes pointer object get an extra reference
        here; ``_warp_array_deleter`` performs the matching ``Py_DecRef`` calls.
        """
        py_obj = ctypes.py_object(self)
        py_obj_ptr = ctypes.pointer(py_obj)
        # extra reference on the holder itself
        ctypes.pythonapi.Py_IncRef(py_obj)
        # extra reference on the ctypes pointer object whose address is handed out below
        ctypes.pythonapi.Py_IncRef(ctypes.py_object(py_obj_ptr))
        return ctypes.cast(py_obj_ptr, ctypes.c_void_p)


@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def _warp_array_deleter(handle: ctypes.c_void_p) -> None:
    """Release the resources attached to a ``DLManagedTensor`` created by ``to_dlpack``.

    Installed as the ``deleter`` field of the managed tensor and also called
    from ``_warp_pycapsule_deleter``.

    Parameters
    ----------
    handle : ctypes.c_void_p
        Address of the ``DLManagedTensor`` to release.
    """

    # view the struct living at the given address
    dl_managed_tensor = DLManagedTensor.from_address(handle)
    # manager_ctx is the pointer produced by _Holder._as_manager_ctx
    py_obj_ptr = ctypes.cast(dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object))
    py_obj = py_obj_ptr.contents
    # undo the two Py_IncRef calls made in _Holder._as_manager_ctx
    ctypes.pythonapi.Py_DecRef(py_obj)
    ctypes.pythonapi.Py_DecRef(ctypes.py_object(py_obj_ptr))
    # the struct itself was allocated with PyMem_RawMalloc in to_dlpack
    ctypes.pythonapi.PyMem_RawFree(handle)


@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def _warp_pycapsule_deleter(handle: ctypes.c_void_p) -> None:
    """Destructor for the PyCapsule created by ``to_dlpack``.

    Parameters
    ----------
    handle : ctypes.c_void_p
        Address of the capsule object being destroyed.
    """

    pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
    # Only release the tensor if the capsule still carries the original name.
    # NOTE(review): presumably a consumer that took ownership renames the
    # capsule and calls the tensor's deleter itself - confirm against the DLPack protocol.
    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
        dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
        _warp_array_deleter(dl_managed_tensor)
        # clear the destructor now that the tensor has been released
        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)


def device_to_dlpack(wp_device) -> DLDevice:
    """Build the DLPack device descriptor for a Warp device.

    Parameters
    ----------
    wp_device
        Anything accepted by ``warp.get_device``.

    Returns
    -------
    DLDevice
        ``kDLCPU`` with id 0 for CPU devices, ``kDLCUDA`` with the device
        ordinal for CUDA devices.

    Raises
    ------
    RuntimeError
        If the resolved device is neither a CPU nor a CUDA device.
    """
    device = warp.get_device(wp_device)

    if device.is_cpu:
        kind, index = DLDeviceType.kDLCPU, 0
    elif device.is_cuda:
        kind, index = DLDeviceType.kDLCUDA, device.ordinal
    else:
        raise RuntimeError("Unhandled device type converting to dlpack")

    descriptor = DLDevice()
    descriptor.device_type = kind
    descriptor.device_id = index
    return descriptor


def dtype_to_dlpack(wp_dtype) -> DLDataType:
    """Map a Warp scalar type to its DLPack description.

    Parameters
    ----------
    wp_dtype
        A Warp scalar type such as ``warp.float32``.

    Returns
    -------
    tuple
        A ``(type_code, bits, lanes)`` triple; ``lanes`` is always 1.

    Raises
    ------
    RuntimeError
        If ``wp_dtype`` is not one of the scalar types listed below.
    """
    signed = DLDataTypeCode.kDLInt
    unsigned = DLDataTypeCode.kDLUInt
    floating = DLDataTypeCode.kDLFloat

    # checked in order, using the same equality test for every entry
    conversions = (
        (warp.int8, (signed, 8, 1)),
        (warp.uint8, (unsigned, 8, 1)),
        (warp.int16, (signed, 16, 1)),
        (warp.uint16, (unsigned, 16, 1)),
        (warp.int32, (signed, 32, 1)),
        (warp.uint32, (unsigned, 32, 1)),
        (warp.int64, (signed, 64, 1)),
        (warp.uint64, (unsigned, 64, 1)),
        (warp.float16, (floating, 16, 1)),
        (warp.float32, (floating, 32, 1)),
        (warp.float64, (floating, 64, 1)),
    )

    for candidate, dl_triple in conversions:
        if wp_dtype == candidate:
            return dl_triple

    raise RuntimeError(f"No conversion from Warp type {wp_dtype} to DLPack type")


def dtype_from_dlpack(dl_dtype):
    """Return the Warp scalar type matching a DLPack data type.

    Only the type code and bit width are inspected; the lane count is not.

    Parameters
    ----------
    dl_dtype
        DLPack data type exposing ``type_code.value`` and ``bits``.

    Raises
    ------
    RuntimeError
        For 1-bit unsigned (boolean), complex, or otherwise unrecognised types.
    """
    # unpack to tuple for easier comparison
    dl_dtype = (dl_dtype.type_code.value, dl_dtype.bits)

    if dl_dtype == (DLDataTypeCode.kDLUInt, 1):
        raise RuntimeError("Warp does not support bit boolean types")

    supported = {
        (DLDataTypeCode.kDLInt, 8): warp.types.int8,
        (DLDataTypeCode.kDLInt, 16): warp.types.int16,
        (DLDataTypeCode.kDLInt, 32): warp.types.int32,
        (DLDataTypeCode.kDLInt, 64): warp.types.int64,
        (DLDataTypeCode.kDLUInt, 8): warp.types.uint8,
        (DLDataTypeCode.kDLUInt, 16): warp.types.uint16,
        (DLDataTypeCode.kDLUInt, 32): warp.types.uint32,
        (DLDataTypeCode.kDLUInt, 64): warp.types.uint64,
        (DLDataTypeCode.kDLFloat, 16): warp.types.float16,
        (DLDataTypeCode.kDLFloat, 32): warp.types.float32,
        (DLDataTypeCode.kDLFloat, 64): warp.types.float64,
    }
    if dl_dtype in supported:
        return supported[dl_dtype]

    if dl_dtype in ((DLDataTypeCode.kDLComplex, 64), (DLDataTypeCode.kDLComplex, 128)):
        raise RuntimeError("Warp does not support complex types")

    raise RuntimeError(f"Unknown dlpack datatype {dl_dtype}")


def device_from_dlpack(dl_device):
    """Translate a DLPack device descriptor into a Warp device string.

    Returns
    -------
    str
        ``"cpu"`` for ``kDLCPU`` / ``kDLCUDAHost`` and ``"cuda:<device_id>"``
        for ``kDLCUDA`` / ``kDLCUDAManaged``.

    Raises
    ------
    RuntimeError
        For any other device type.
    """
    type_value = dl_device.device_type.value

    if type_value in (DLDeviceType.kDLCPU, DLDeviceType.kDLCUDAHost):
        return "cpu"

    if type_value in (DLDeviceType.kDLCUDA, DLDeviceType.kDLCUDAManaged):
        return f"cuda:{dl_device.device_id}"

    raise RuntimeError(f"Unknown device type from dlpack: {dl_device.device_type.value}")


def shape_to_dlpack(shape):
    """Copy a shape sequence into a freshly created ctypes ``c_int64`` array."""
    int64_array_type = ctypes.c_int64 * len(shape)
    return int64_array_type(*shape)


def strides_to_dlpack(strides, dtype):
    """Convert byte strides into a ctypes ``c_int64`` array of element strides.

    Parameters
    ----------
    strides
        Iterable of strides expressed in bytes.
    dtype
        Warp type whose size, as reported by ``warp.types.type_size_in_bytes``,
        defines one element.

    Returns
    -------
    ctypes array of ``c_int64``
        One entry per input stride, expressed in elements.
    """
    # convert from byte count to element count; the item size is loop-invariant
    itemsize = int(warp.types.type_size_in_bytes(dtype))

    # integer division keeps the result exact (no round-trip through float)
    element_strides = [int(stride) // itemsize for stride in strides]

    return (ctypes.c_int64 * len(element_strides))(*element_strides)


def to_dlpack(wp_array: warp.array):
    """Export a Warp array as a DLPack capsule without copying its memory.

    Arrays of vector/matrix types are exported as arrays of the underlying
    scalar type with the vector/matrix shape appended as trailing dimensions.

    Parameters
    ----------
    wp_array : warp.array
        The source Warp array that will be converted.

    Returns
    -------
    pycapsule : PyCapsule
        A pycapsule containing a DLManagedTensor that can be converted
        to other array formats without copying the underlying memory.

    Raises
    ------
    RuntimeError
        If the array has a struct dtype, or a dtype / device with no DLPack
        equivalent.
    MemoryError
        If the DLManagedTensor could not be allocated.
    """

    # DLPack does not support structured arrays
    if isinstance(wp_array.dtype, warp.codegen.Struct):
        raise RuntimeError("Cannot convert structured Warp arrays to DLPack.")

    # handle vector types
    if hasattr(wp_array.dtype, "_wp_scalar_type_"):
        # vector type, flatten the dimensions into one tuple
        target_dtype = wp_array.dtype._wp_scalar_type_
        target_ndim = wp_array.ndim + len(wp_array.dtype._shape_)
        target_shape = (*wp_array.shape, *wp_array.dtype._shape_)
        dtype_strides = warp.types.strides_from_shape(wp_array.dtype._shape_, wp_array.dtype._wp_scalar_type_)
        target_strides = (*wp_array.strides, *dtype_strides)
    else:
        # scalar type
        target_dtype = wp_array.dtype
        target_ndim = wp_array.ndim
        target_shape = wp_array.shape
        target_strides = wp_array.strides

    # Do every conversion that can raise *before* allocating raw memory or
    # taking extra references, so a failure cannot leak either.
    dl_dtype = dtype_to_dlpack(target_dtype)

    if wp_array.pinned:
        # the tensor's device field is a DLDevice struct, not a bare device type value
        dl_device = DLDevice()
        dl_device.device_type = DLDeviceType.kDLCUDAHost
        dl_device.device_id = 0
    else:
        dl_device = device_to_dlpack(wp_array.device)

    holder = _Holder(wp_array)

    # store the shape and stride arrays with the holder to prevent them from getting deallocated
    holder._shape = shape_to_dlpack(target_shape)
    holder._strides = strides_to_dlpack(target_strides, target_dtype)

    # allocate DLManagedTensor
    size = ctypes.c_size_t(ctypes.sizeof(DLManagedTensor))
    address = ctypes.pythonapi.PyMem_RawMalloc(size)
    if not address:
        raise MemoryError("Failed to allocate memory for DLManagedTensor")
    dl_managed_tensor = DLManagedTensor.from_address(address)

    # set Tensor attributes
    dl_managed_tensor.dl_tensor.data = wp_array.ptr
    dl_managed_tensor.dl_tensor.device = dl_device
    dl_managed_tensor.dl_tensor.ndim = target_ndim
    dl_managed_tensor.dl_tensor.dtype = dl_dtype
    dl_managed_tensor.dl_tensor.shape = holder._shape
    dl_managed_tensor.dl_tensor.strides = holder._strides
    dl_managed_tensor.dl_tensor.byte_offset = 0
    # takes the extra references that _warp_array_deleter releases
    dl_managed_tensor.manager_ctx = holder._as_manager_ctx()
    dl_managed_tensor.deleter = _warp_array_deleter

    pycapsule = ctypes.pythonapi.PyCapsule_New(
        ctypes.byref(dl_managed_tensor),
        _c_str_dltensor,
        _warp_pycapsule_deleter,
    )

    return pycapsule


def dtype_is_compatible(dl_dtype, wp_dtype):
    """Check whether a DLPack data type can be viewed as a Warp scalar type.

    Float types must match in width. Signed and unsigned DLPack integers are
    both accepted for either the signed or the unsigned Warp type of the same
    width.

    Parameters
    ----------
    dl_dtype
        DLPack data type exposing ``type_code.value`` and ``bits``.
    wp_dtype
        Warp scalar type to compare against.

    Returns
    -------
    bool
        ``True`` when compatible, ``False`` otherwise (including float/int
        widths that have no Warp counterpart).

    Raises
    ------
    RuntimeError
        If the bit width is not a multiple of 8, or the type code is bfloat,
        complex, or unrecognised.
    """
    if dl_dtype.bits % 8 != 0:
        raise RuntimeError("Data types with less than 8 bits are not supported")

    type_code = dl_dtype.type_code.value

    if type_code == DLDataTypeCode.kDLFloat:
        candidates_by_bits = {
            16: (warp.float16,),
            32: (warp.float32,),
            64: (warp.float64,),
        }
    elif type_code == DLDataTypeCode.kDLInt or type_code == DLDataTypeCode.kDLUInt:
        candidates_by_bits = {
            8: (warp.int8, warp.uint8),
            16: (warp.int16, warp.uint16),
            32: (warp.int32, warp.uint32),
            64: (warp.int64, warp.uint64),
        }
    elif type_code == DLDataTypeCode.kDLBfloat:
        raise RuntimeError("Bfloat data type is not supported")
    elif type_code == DLDataTypeCode.kDLComplex:
        raise RuntimeError("Complex data types are not supported")
    else:
        raise RuntimeError(f"Unsupported dlpack dtype {(str(dl_dtype.type_code), dl_dtype.bits)}")

    # widths without a table entry yield an explicit False rather than None
    return any(wp_dtype == candidate for candidate in candidates_by_bits.get(dl_dtype.bits, ()))


def from_dlpack(pycapsule, dtype=None) -> warp.array:
    """Convert a DLPack capsule into a Warp array without copying.

    Parameters
    ----------
    pycapsule : PyCapsule
        A pycapsule wrapping a dlpack tensor that will be converted.
    dtype : optional
        Warp data type for the resulting array. When omitted, the scalar type
        is derived from the tensor. A vector/matrix type may be given, in which
        case the trailing tensor dimensions must match the type's shape and be
        contiguous.

    Returns
    -------
    wp_array : warp.array
        A Warp array that uses the same underlying memory as the input
        pycapsule. The array keeps a reference to the capsule.

    Raises
    ------
    AssertionError
        If ``pycapsule`` is not a valid DLPack tensor capsule.
    RuntimeError
        If the tensor's device or data type is unsupported, or is incompatible
        with the requested ``dtype``.
    """

    # Raised explicitly (rather than via an ``assert`` statement) so the check
    # is not stripped under ``python -O``; the exception type is unchanged.
    if not ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
        raise AssertionError("Expected a valid DLPack tensor capsule")

    dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
    dl_managed_tensor_ptr = ctypes.cast(dl_managed_tensor, ctypes.POINTER(DLManagedTensor))
    dl_managed_tensor = dl_managed_tensor_ptr.contents

    dlt = dl_managed_tensor.dl_tensor
    assert isinstance(dlt, DLTensor)

    device = device_from_dlpack(dlt.device)

    pinned = dlt.device.device_type.value == DLDeviceType.kDLCUDAHost

    shape = tuple(dlt.shape[dim] for dim in range(dlt.ndim))

    lanes = dlt.dtype.lanes
    itemsize = dlt.dtype.bits // 8

    # DLPack strides count whole elements, and one element of a multi-lane
    # dtype spans ``lanes`` scalars, so scale by the full element size.
    element_size = itemsize * lanes if lanes > 1 else itemsize

    if dlt.strides:
        strides = tuple(dlt.strides[dim] * element_size for dim in range(dlt.ndim))
    else:
        strides = None

    # handle multi-lane dtypes as another dimension
    if lanes > 1:
        shape = (*shape, lanes)
        if strides is not None:
            strides = (*strides, itemsize)

    if dtype is None:
        # automatically detect dtype
        dtype = dtype_from_dlpack(dlt.dtype)

    elif hasattr(dtype, "_wp_scalar_type_"):
        # handle vector/matrix types

        if not dtype_is_compatible(dlt.dtype, dtype._wp_scalar_type_):
            raise RuntimeError(f"Incompatible data types: {dlt.dtype} and {dtype}")

        dtype_shape = dtype._shape_
        dtype_dims = len(dtype._shape_)
        if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]:
            raise RuntimeError(
                f"Could not convert DLPack tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}"
            )

        if strides is not None:
            # ensure the inner strides are contiguous
            stride = itemsize
            for i in range(dtype_dims):
                if strides[-i - 1] != stride:
                    raise RuntimeError(
                        f"Could not convert DLPack tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous"
                    )
                stride *= dtype_shape[-i - 1]
            strides = tuple(strides[:-dtype_dims]) or (itemsize,)

        shape = tuple(shape[:-dtype_dims]) or (1,)

    elif not dtype_is_compatible(dlt.dtype, dtype):
        # incompatible dtype requested
        raise RuntimeError(f"Incompatible data types: {dlt.dtype} and {dtype}")

    # the tensor's first element lives at ``data + byte_offset``
    ptr = dlt.data
    if ptr is not None and dlt.byte_offset:
        ptr += dlt.byte_offset

    a = warp.types.array(
        ptr=ptr, dtype=dtype, shape=shape, strides=strides, copy=False, owner=False, device=device, pinned=pinned
    )

    # keep a reference to the capsule so it doesn't get deleted
    a._pycapsule = pycapsule

    return a