# NOTE: removed web-export artifacts that preceded this file (uploader line,
# commit message "added NvidiaWarp and GarmentCode repos", revision 66c9c8a) —
# they were not Python and broke parsing.
# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import warp
import ctypes
from warp.thirdparty.dlpack import (
DLManagedTensor,
DLDevice,
DLDeviceType,
DLDataType,
DLDataTypeCode,
DLTensor,
_c_str_dltensor,
)
# Declare result/argument types for the CPython C-API entry points used below.
# Without these declarations ctypes assumes int results/arguments, which
# truncates 64-bit pointers.
ctypes.pythonapi.PyMem_RawMalloc.restype = ctypes.c_void_p
ctypes.pythonapi.PyMem_RawFree.argtypes = [ctypes.c_void_p]
# PyCapsule_New(pointer, name, destructor) -> capsule object
ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
ctypes.pythonapi.PyCapsule_New.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
# PyCapsule_IsValid(capsule, name) -> 0/1
ctypes.pythonapi.PyCapsule_IsValid.restype = ctypes.c_int
ctypes.pythonapi.PyCapsule_IsValid.argtypes = [ctypes.py_object, ctypes.c_char_p]
# PyCapsule_GetPointer(capsule, name) -> raw pointer stored in the capsule
ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]
class _Holder:
    """Keeps a Warp array alive while a DLPack consumer borrows its memory.

    An instance of this class becomes the ``manager_ctx`` of the exported
    ``DLManagedTensor``; the matching decrefs happen in ``_warp_array_deleter``.
    """

    def __init__(self, wp_array) -> None:
        # the borrowed-from array; shape/stride ctypes buffers are attached
        # later by to_dlpack() so they share this object's lifetime
        self.wp_array = wp_array

    def _as_manager_ctx(self) -> ctypes.c_void_p:
        """Return a raw pointer to a py_object referencing *self*.

        Both the holder and the pointer wrapper are increfed so neither is
        collected until _warp_array_deleter runs the matching Py_DecRef calls.
        """
        py_obj = ctypes.py_object(self)
        py_obj_ptr = ctypes.pointer(py_obj)
        ctypes.pythonapi.Py_IncRef(py_obj)
        ctypes.pythonapi.Py_IncRef(ctypes.py_object(py_obj_ptr))
        return ctypes.cast(py_obj_ptr, ctypes.c_void_p)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def _warp_array_deleter(handle: ctypes.c_void_p) -> None:
    """A function to deallocate the memory of a Warp array.

    Called (directly or via the capsule destructor) with the address of the
    DLManagedTensor allocated in to_dlpack().
    """
    dl_managed_tensor = DLManagedTensor.from_address(handle)
    # recover the py_object pointer created by _Holder._as_manager_ctx()
    py_obj_ptr = ctypes.cast(dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object))
    py_obj = py_obj_ptr.contents
    # balance the two Py_IncRef calls made in _as_manager_ctx()
    ctypes.pythonapi.Py_DecRef(py_obj)
    ctypes.pythonapi.Py_DecRef(ctypes.py_object(py_obj_ptr))
    # release the raw allocation made with PyMem_RawMalloc in to_dlpack()
    ctypes.pythonapi.PyMem_RawFree(handle)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def _warp_pycapsule_deleter(handle: ctypes.c_void_p) -> None:
    """A function to deallocate a pycapsule that wraps a Warp array.

    Runs when the capsule is destroyed. If the capsule is still named
    "dltensor" it was never consumed, so the tensor memory must be freed
    here; a consumer that took ownership renames the capsule, making
    PyCapsule_IsValid return false and this a no-op.
    """
    pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
        dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
        _warp_array_deleter(dl_managed_tensor)
        # clear the destructor so it cannot run a second time
        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
def device_to_dlpack(wp_device) -> DLDevice:
    """Build the DLDevice descriptor corresponding to a Warp device.

    Raises RuntimeError for devices that are neither CPU nor CUDA.
    """
    device = warp.get_device(wp_device)
    descriptor = DLDevice()
    if device.is_cpu:
        descriptor.device_type = DLDeviceType.kDLCPU
        descriptor.device_id = 0
    elif device.is_cuda:
        descriptor.device_type = DLDeviceType.kDLCUDA
        descriptor.device_id = device.ordinal
    else:
        raise RuntimeError("Unhandled device type converting to dlpack")
    return descriptor
def dtype_to_dlpack(wp_dtype) -> DLDataType:
    """Return the DLPack (type_code, bits, lanes) triple for a Warp scalar type.

    Raises RuntimeError when the Warp type has no DLPack equivalent.
    """
    conversions = (
        (warp.int8, (DLDataTypeCode.kDLInt, 8, 1)),
        (warp.uint8, (DLDataTypeCode.kDLUInt, 8, 1)),
        (warp.int16, (DLDataTypeCode.kDLInt, 16, 1)),
        (warp.uint16, (DLDataTypeCode.kDLUInt, 16, 1)),
        (warp.int32, (DLDataTypeCode.kDLInt, 32, 1)),
        (warp.uint32, (DLDataTypeCode.kDLUInt, 32, 1)),
        (warp.int64, (DLDataTypeCode.kDLInt, 64, 1)),
        (warp.uint64, (DLDataTypeCode.kDLUInt, 64, 1)),
        (warp.float16, (DLDataTypeCode.kDLFloat, 16, 1)),
        (warp.float32, (DLDataTypeCode.kDLFloat, 32, 1)),
        (warp.float64, (DLDataTypeCode.kDLFloat, 64, 1)),
    )
    # scan with == rather than a dict lookup to preserve the original
    # equality-based matching semantics for Warp type objects
    for wp_candidate, dl_triple in conversions:
        if wp_dtype == wp_candidate:
            return dl_triple
    raise RuntimeError(f"No conversion from Warp type {wp_dtype} to DLPack type")
def dtype_from_dlpack(dl_dtype):
    """Return the Warp scalar type matching a DLPack dtype descriptor.

    Raises RuntimeError for bit-boolean types, complex types, and any
    unrecognized (type_code, bits) combination.
    """
    # normalize to a plain (type_code_value, bits) tuple for lookups
    key = (dl_dtype.type_code.value, dl_dtype.bits)

    if key == (DLDataTypeCode.kDLUInt, 1):
        raise RuntimeError("Warp does not support bit boolean types")
    if key in ((DLDataTypeCode.kDLComplex, 64), (DLDataTypeCode.kDLComplex, 128)):
        raise RuntimeError("Warp does not support complex types")

    scalar_map = {
        (DLDataTypeCode.kDLInt, 8): warp.types.int8,
        (DLDataTypeCode.kDLInt, 16): warp.types.int16,
        (DLDataTypeCode.kDLInt, 32): warp.types.int32,
        (DLDataTypeCode.kDLInt, 64): warp.types.int64,
        (DLDataTypeCode.kDLUInt, 8): warp.types.uint8,
        (DLDataTypeCode.kDLUInt, 16): warp.types.uint16,
        (DLDataTypeCode.kDLUInt, 32): warp.types.uint32,
        (DLDataTypeCode.kDLUInt, 64): warp.types.uint64,
        (DLDataTypeCode.kDLFloat, 16): warp.types.float16,
        (DLDataTypeCode.kDLFloat, 32): warp.types.float32,
        (DLDataTypeCode.kDLFloat, 64): warp.types.float64,
    }
    wp_type = scalar_map.get(key)
    if wp_type is None:
        raise RuntimeError(f"Unknown dlpack datatype {key}")
    return wp_type
def device_from_dlpack(dl_device):
    """Translate a DLDevice descriptor into a Warp device string.

    CPU and CUDA-pinned-host tensors map to "cpu"; CUDA and CUDA-managed
    tensors map to "cuda:<ordinal>". Anything else raises RuntimeError.
    """
    dev_type = dl_device.device_type.value
    if dev_type in (DLDeviceType.kDLCPU, DLDeviceType.kDLCUDAHost):
        return "cpu"
    if dev_type in (DLDeviceType.kDLCUDA, DLDeviceType.kDLCUDAManaged):
        return f"cuda:{dl_device.device_id}"
    raise RuntimeError(f"Unknown device type from dlpack: {dev_type}")
def shape_to_dlpack(shape):
    """Copy *shape* into a ctypes int64 array suitable for DLTensor.shape."""
    dims = len(shape)
    packed = (ctypes.c_int64 * dims)()
    for index, extent in enumerate(shape):
        packed[index] = extent
    return packed
def strides_to_dlpack(strides, dtype):
    """Convert byte strides into element-count strides in a ctypes int64 array.

    Warp stores strides in bytes while DLPack expects them in element counts,
    so each stride is divided by the element size of *dtype*.
    """
    element_size = int(warp.types.type_size_in_bytes(dtype))
    element_strides = [int(int(byte_stride) / element_size) for byte_stride in strides]
    return (ctypes.c_int64 * len(element_strides))(*element_strides)
def to_dlpack(wp_array: warp.array):
    """Convert a Warp array to a DLPack capsule without copying.

    Parameters
    ----------
    wp_array : warp.array
        The source Warp array that will be wrapped.

    Returns
    -------
    pycapsule : PyCapsule
        A pycapsule containing a DLManagedTensor that can be converted
        to other array formats without copying the underlying memory.

    Raises
    ------
    RuntimeError
        If the array has a structured dtype, which DLPack cannot represent.
    """
    # DLPack does not support structured arrays
    if isinstance(wp_array.dtype, warp.codegen.Struct):
        raise RuntimeError("Cannot convert structured Warp arrays to DLPack.")

    # the holder keeps the Warp array (and the ctypes shape/stride buffers
    # attached below) alive while the DLManagedTensor borrows its memory
    holder = _Holder(wp_array)

    # allocate the DLManagedTensor outside the Python object heap; it is
    # released by _warp_array_deleter via PyMem_RawFree
    size = ctypes.c_size_t(ctypes.sizeof(DLManagedTensor))
    dl_managed_tensor = DLManagedTensor.from_address(ctypes.pythonapi.PyMem_RawMalloc(size))

    # handle vector types
    if hasattr(wp_array.dtype, "_wp_scalar_type_"):
        # vector/matrix type: flatten the dtype's own dimensions into the
        # exported tensor shape/strides
        target_dtype = wp_array.dtype._wp_scalar_type_
        target_ndim = wp_array.ndim + len(wp_array.dtype._shape_)
        target_shape = (*wp_array.shape, *wp_array.dtype._shape_)
        dtype_strides = warp.types.strides_from_shape(wp_array.dtype._shape_, wp_array.dtype._wp_scalar_type_)
        target_strides = (*wp_array.strides, *dtype_strides)
    else:
        # scalar type
        target_dtype = wp_array.dtype
        target_ndim = wp_array.ndim
        target_shape = wp_array.shape
        target_strides = wp_array.strides

    # store the shape and stride arrays with the holder to prevent them from getting deallocated
    holder._shape = shape_to_dlpack(target_shape)
    holder._strides = strides_to_dlpack(target_strides, target_dtype)

    if wp_array.pinned:
        # pinned host memory is advertised to consumers as kDLCUDAHost.
        # FIX: DLTensor.device is a DLDevice struct; the previous code
        # assigned the bare DLDeviceType enum value, which ctypes rejects.
        dl_device = DLDevice()
        dl_device.device_type = DLDeviceType.kDLCUDAHost
        dl_device.device_id = 0
    else:
        dl_device = device_to_dlpack(wp_array.device)

    # set Tensor attributes
    dl_managed_tensor.dl_tensor.data = wp_array.ptr
    dl_managed_tensor.dl_tensor.device = dl_device
    dl_managed_tensor.dl_tensor.ndim = target_ndim
    dl_managed_tensor.dl_tensor.dtype = dtype_to_dlpack(target_dtype)
    dl_managed_tensor.dl_tensor.shape = holder._shape
    dl_managed_tensor.dl_tensor.strides = holder._strides
    dl_managed_tensor.dl_tensor.byte_offset = 0

    # wire up ownership: manager_ctx pins the holder, deleter releases it
    dl_managed_tensor.manager_ctx = holder._as_manager_ctx()
    dl_managed_tensor.deleter = _warp_array_deleter

    pycapsule = ctypes.pythonapi.PyCapsule_New(
        ctypes.byref(dl_managed_tensor),
        _c_str_dltensor,
        _warp_pycapsule_deleter,
    )
    return pycapsule
def dtype_is_compatible(dl_dtype, wp_dtype):
    """Check whether a DLPack dtype descriptor may be viewed as *wp_dtype*.

    Integer widths are matched regardless of sign, so signed/unsigned
    reinterpretation is allowed (e.g. int8 data viewed as warp.uint8).

    Parameters
    ----------
    dl_dtype : DLDataType
        The DLPack dtype descriptor (type_code, bits, lanes).
    wp_dtype
        The Warp scalar type requested by the caller.

    Returns
    -------
    bool
        True if the descriptor matches *wp_dtype*, False otherwise.

    Raises
    ------
    RuntimeError
        For sub-byte, bfloat, complex, or unknown type codes.
    """
    if dl_dtype.bits % 8 != 0:
        raise RuntimeError("Data types with less than 8 bits are not supported")
    type_code = dl_dtype.type_code.value
    bits = dl_dtype.bits
    if type_code == DLDataTypeCode.kDLFloat:
        if bits == 16:
            return wp_dtype == warp.float16
        elif bits == 32:
            return wp_dtype == warp.float32
        elif bits == 64:
            return wp_dtype == warp.float64
    elif type_code == DLDataTypeCode.kDLInt or type_code == DLDataTypeCode.kDLUInt:
        if bits == 8:
            return wp_dtype == warp.int8 or wp_dtype == warp.uint8
        elif bits == 16:
            return wp_dtype == warp.int16 or wp_dtype == warp.uint16
        elif bits == 32:
            return wp_dtype == warp.int32 or wp_dtype == warp.uint32
        elif bits == 64:
            return wp_dtype == warp.int64 or wp_dtype == warp.uint64
    elif type_code == DLDataTypeCode.kDLBfloat:
        raise RuntimeError("Bfloat data type is not supported")
    elif type_code == DLDataTypeCode.kDLComplex:
        raise RuntimeError("Complex data types are not supported")
    else:
        raise RuntimeError(f"Unsupported dlpack dtype {(str(dl_dtype.type_code), dl_dtype.bits)}")
    # FIX: bit widths not handled above (e.g. a 128-bit float) used to fall
    # off the ladder and implicitly return None; report False explicitly.
    return False
def from_dlpack(pycapsule, dtype=None) -> warp.array:
    """Convert a DLPack capsule into a Warp array without copying.

    Parameters
    ----------
    pycapsule : PyCapsule
        A pycapsule wrapping a dlpack tensor that will be converted.
    dtype : Warp scalar/vector/matrix type, optional
        Target dtype for the resulting array. If None, a scalar dtype is
        inferred from the DLPack descriptor. A vector/matrix dtype consumes
        matching contiguous inner dimensions of the source tensor.

    Returns
    -------
    wp_array : warp.array
        A new Warp array that uses the same underlying memory as the input
        pycapsule (no copy is made).

    Raises
    ------
    RuntimeError
        If the requested dtype is incompatible with the tensor, or the
        inner shape/strides do not match a vector/matrix dtype.
    """
    # NOTE(review): assert is stripped under python -O; an explicit raise
    # would be more robust — confirm before changing
    assert ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor)
    dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
    dl_managed_tensor_ptr = ctypes.cast(dl_managed_tensor, ctypes.POINTER(DLManagedTensor))
    dl_managed_tensor = dl_managed_tensor_ptr.contents

    dlt = dl_managed_tensor.dl_tensor
    assert isinstance(dlt, DLTensor)

    device = device_from_dlpack(dlt.device)
    # kDLCUDAHost marks pinned host memory
    pinned = dlt.device.device_type.value == DLDeviceType.kDLCUDAHost
    shape = tuple(dlt.shape[dim] for dim in range(dlt.ndim))

    itemsize = dlt.dtype.bits // 8
    if dlt.strides:
        # DLPack strides are in element counts; Warp expects byte strides
        strides = tuple(dlt.strides[dim] * itemsize for dim in range(dlt.ndim))
    else:
        # a NULL strides pointer means a compact row-major layout
        strides = None

    # handle multi-lane dtypes as another dimension
    if dlt.dtype.lanes > 1:
        shape = (*shape, dlt.dtype.lanes)
        if strides is not None:
            strides = (*strides, itemsize)

    if dtype is None:
        # automatically detect dtype
        dtype = dtype_from_dlpack(dlt.dtype)
    elif hasattr(dtype, "_wp_scalar_type_"):
        # handle vector/matrix types
        if not dtype_is_compatible(dlt.dtype, dtype._wp_scalar_type_):
            raise RuntimeError(f"Incompatible data types: {dlt.dtype} and {dtype}")
        dtype_shape = dtype._shape_
        dtype_dims = len(dtype._shape_)
        # the tensor's trailing dimensions must match the dtype's own shape
        if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]:
            raise RuntimeError(
                f"Could not convert DLPack tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}"
            )
        if strides is not None:
            # ensure the inner strides are contiguous
            stride = itemsize
            for i in range(dtype_dims):
                if strides[-i - 1] != stride:
                    raise RuntimeError(
                        f"Could not convert DLPack tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous"
                    )
                stride *= dtype_shape[-i - 1]
            # drop the absorbed inner dimensions; fall back to a single
            # element when the dtype consumes every dimension
            strides = tuple(strides[:-dtype_dims]) or (itemsize,)
        shape = tuple(shape[:-dtype_dims]) or (1,)
    elif not dtype_is_compatible(dlt.dtype, dtype):
        # incompatible dtype requested
        raise RuntimeError(f"Incompatible data types: {dlt.dtype} and {dtype}")

    # owner=False: the memory stays owned by the producer behind the capsule
    a = warp.types.array(
        ptr=dlt.data, dtype=dtype, shape=shape, strides=strides, copy=False, owner=False, device=device, pinned=pinned
    )
    # keep a reference to the capsule so it doesn't get deleted
    a._pycapsule = pycapsule
    return a