Implement pycuda backend for inference with TensorRT engine
Browse files
in addition to the original polygraphy backend.
The default is polygraphy. You can set TRT_BACKEND to 'PYCUDA' to choose the pycuda backend.
- rtmo_gpu.py +125 -19
rtmo_gpu.py
CHANGED
|
@@ -5,6 +5,7 @@ import onnxruntime as ort
|
|
| 5 |
import cv2
|
| 6 |
from queue import Queue
|
| 7 |
os.environ['ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS']='libmmdeploy_tensorrt_ops.so'
|
|
|
|
| 8 |
|
| 9 |
# dictionary from https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/skeleton/coco17.py
|
| 10 |
coco17 = dict(name='coco17',
|
|
@@ -442,17 +443,39 @@ class RTMO_GPU(object):
|
|
| 442 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
| 443 |
|
| 444 |
else: # 'engine'
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
return outputs
|
| 453 |
|
| 454 |
def __exit__(self):
|
| 455 |
-
if self.model_format == 'engine':
|
| 456 |
if self.session.is_active:
|
| 457 |
self.session.deactivate()
|
| 458 |
|
|
@@ -471,7 +494,11 @@ class RTMO_GPU(object):
|
|
| 471 |
mean: tuple = None,
|
| 472 |
std: tuple = None,
|
| 473 |
device: str = 'cuda',
|
| 474 |
-
is_yolo_nas_pose = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
|
| 476 |
if not os.path.exists(model):
|
| 477 |
# If the file does not exist, raise FileNotFoundError
|
|
@@ -499,10 +526,62 @@ class RTMO_GPU(object):
|
|
| 499 |
providers=providers[device])
|
| 500 |
|
| 501 |
else: # 'engine'
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
self.model_input_size = self.input_shape[2:4] # B, C, H, W,
|
| 508 |
self.mean = mean
|
|
@@ -510,6 +589,9 @@ class RTMO_GPU(object):
|
|
| 510 |
self.device = device
|
| 511 |
self.is_yolo_nas_pose = is_yolo_nas_pose
|
| 512 |
|
|
|
|
|
|
|
|
|
|
| 513 |
class RTMO_GPU_Batch(RTMO_GPU):
|
| 514 |
def preprocess_batch(self, imgs: List[np.ndarray]) -> Tuple[np.ndarray, List[float]]:
|
| 515 |
"""Process a batch of images for RTMPose model inference.
|
|
@@ -571,12 +653,34 @@ class RTMO_GPU_Batch(RTMO_GPU):
|
|
| 571 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
| 572 |
|
| 573 |
else: # 'engine'
|
|
|
|
|
|
|
|
|
|
| 574 |
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
|
| 581 |
return outputs
|
| 582 |
|
|
@@ -651,14 +755,16 @@ class RTMO_GPU_Batch(RTMO_GPU):
|
|
| 651 |
std: tuple = None,
|
| 652 |
device: str = 'cuda',
|
| 653 |
is_yolo_nas_pose = False,
|
|
|
|
| 654 |
batch_size: int = 1):
|
| 655 |
super().__init__(model,
|
| 656 |
mean,
|
| 657 |
std,
|
| 658 |
device,
|
| 659 |
-
is_yolo_nas_pose
|
|
|
|
|
|
|
| 660 |
|
| 661 |
-
self.batch_size = batch_size
|
| 662 |
self.in_queues = dict()
|
| 663 |
self.out_queues = dict()
|
| 664 |
self.buffers = dict()
|
|
|
|
| 5 |
import cv2
|
| 6 |
from queue import Queue
|
| 7 |
os.environ['ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS']='libmmdeploy_tensorrt_ops.so'
|
| 8 |
+
TRT_BACKEND='POLYGRAPHY'
|
| 9 |
|
| 10 |
# dictionary from https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/skeleton/coco17.py
|
| 11 |
coco17 = dict(name='coco17',
|
|
|
|
| 443 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
| 444 |
|
| 445 |
else: # 'engine'
|
| 446 |
+
if TRT_BACKEND == 'POLYGRAPHY':
|
| 447 |
+
if not self.session.is_active:
|
| 448 |
+
self.session.activate()
|
| 449 |
+
|
| 450 |
+
outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
|
| 451 |
+
outputs = [output for output in outputs.values()]
|
| 452 |
+
else: # PYCUDA
|
| 453 |
+
import pycuda.driver as cuda
|
| 454 |
+
# Set the input shape dynamically
|
| 455 |
+
input_shape = input.shape
|
| 456 |
+
self.context.set_binding_shape(0, input_shape)
|
| 457 |
+
|
| 458 |
+
# Ensure input_data matches the expected shape
|
| 459 |
+
np.copyto(self.inputs[0]['host'], input.ravel())
|
| 460 |
+
cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
|
| 461 |
+
|
| 462 |
+
# Run inference
|
| 463 |
+
self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
|
| 464 |
+
|
| 465 |
+
# Transfer predictions back from the GPU
|
| 466 |
+
for output in self.outputs:
|
| 467 |
+
cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
|
| 468 |
+
|
| 469 |
+
# Synchronize the stream
|
| 470 |
+
self.stream.synchronize()
|
| 471 |
+
|
| 472 |
+
# Return only the output values (in their original shapes)
|
| 473 |
+
outputs = [out['host'].reshape(out['shape']) for out in self.outputs]
|
| 474 |
|
| 475 |
return outputs
|
| 476 |
|
| 477 |
def __exit__(self):
|
| 478 |
+
if self.model_format == 'engine' and TRT_BACKEND == 'POLYGRAPHY':
|
| 479 |
if self.session.is_active:
|
| 480 |
self.session.deactivate()
|
| 481 |
|
|
|
|
| 494 |
mean: tuple = None,
|
| 495 |
std: tuple = None,
|
| 496 |
device: str = 'cuda',
|
| 497 |
+
is_yolo_nas_pose = False,
|
| 498 |
+
batch_size = 1,
|
| 499 |
+
plugin_path = 'libmmdeploy_tensorrt_ops.so'):
|
| 500 |
+
|
| 501 |
+
self.batch_size = batch_size
|
| 502 |
|
| 503 |
if not os.path.exists(model):
|
| 504 |
# If the file does not exist, raise FileNotFoundError
|
|
|
|
| 526 |
providers=providers[device])
|
| 527 |
|
| 528 |
else: # 'engine'
|
| 529 |
+
if TRT_BACKEND == 'POLYGRAPHY':
|
| 530 |
+
from polygraphy.backend.common import BytesFromPath
|
| 531 |
+
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
|
| 532 |
+
engine = EngineFromBytes(BytesFromPath(model))
|
| 533 |
+
self.session = TrtRunner(engine)
|
| 534 |
+
else: # PYCUDA
|
| 535 |
+
import tensorrt as trt
|
| 536 |
+
import ctypes
|
| 537 |
+
import pycuda.autoinit
|
| 538 |
+
import pycuda.driver as cuda
|
| 539 |
+
self.TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
|
| 540 |
+
self.trt_model_path = model
|
| 541 |
+
self.plugin_path = plugin_path
|
| 542 |
+
|
| 543 |
+
# Load the custom plugin library
|
| 544 |
+
ctypes.CDLL(self.plugin_path)
|
| 545 |
+
|
| 546 |
+
# Load the TensorRT engine
|
| 547 |
+
with open(self.trt_model_path, 'rb') as f:
|
| 548 |
+
engine_data = f.read()
|
| 549 |
+
|
| 550 |
+
self.runtime = trt.Runtime(self.TRT_LOGGER)
|
| 551 |
+
self.engine = self.runtime.deserialize_cuda_engine(engine_data)
|
| 552 |
+
|
| 553 |
+
if self.engine is None:
|
| 554 |
+
raise RuntimeError("Failed to load the engine.")
|
| 555 |
+
|
| 556 |
+
self.context = self.engine.create_execution_context()
|
| 557 |
+
|
| 558 |
+
self.inputs = []
|
| 559 |
+
self.outputs = []
|
| 560 |
+
self.bindings = []
|
| 561 |
+
self.stream = cuda.Stream()
|
| 562 |
+
|
| 563 |
+
# Allocate memory for inputs and outputs
|
| 564 |
+
for binding in self.engine:
|
| 565 |
+
binding_index = self.engine.get_binding_index(binding)
|
| 566 |
+
shape = self.engine.get_binding_shape(binding_index)
|
| 567 |
+
if shape[0] == -1:
|
| 568 |
+
# Handle dynamic batch size by setting max_batch_size
|
| 569 |
+
shape[0] = self.batch_size
|
| 570 |
+
size = trt.volume(shape)
|
| 571 |
+
dtype = trt.nptype(self.engine.get_binding_dtype(binding))
|
| 572 |
+
|
| 573 |
+
# Allocate host and device buffers
|
| 574 |
+
host_mem = cuda.pagelocked_empty(size, dtype)
|
| 575 |
+
device_mem = cuda.mem_alloc(host_mem.nbytes)
|
| 576 |
+
|
| 577 |
+
# Append the device buffer to device bindings.
|
| 578 |
+
self.bindings.append(int(device_mem))
|
| 579 |
+
|
| 580 |
+
# Append to the appropriate list.
|
| 581 |
+
if self.engine.binding_is_input(binding):
|
| 582 |
+
self.inputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
|
| 583 |
+
else:
|
| 584 |
+
self.outputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
|
| 585 |
|
| 586 |
self.model_input_size = self.input_shape[2:4] # B, C, H, W,
|
| 587 |
self.mean = mean
|
|
|
|
| 589 |
self.device = device
|
| 590 |
self.is_yolo_nas_pose = is_yolo_nas_pose
|
| 591 |
|
| 592 |
+
print(f'[I] Detected \'{self.model_format.upper()}\' model', end='')
|
| 593 |
+
print(f', \'{TRT_BACKEND.upper()}\' backend is chosen for inference' if self.model_format == 'engine' else '')
|
| 594 |
+
|
| 595 |
class RTMO_GPU_Batch(RTMO_GPU):
|
| 596 |
def preprocess_batch(self, imgs: List[np.ndarray]) -> Tuple[np.ndarray, List[float]]:
|
| 597 |
"""Process a batch of images for RTMPose model inference.
|
|
|
|
| 653 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
| 654 |
|
| 655 |
else: # 'engine'
|
| 656 |
+
if TRT_BACKEND == 'POLYGRAPHY':
|
| 657 |
+
if not self.session.is_active:
|
| 658 |
+
self.session.activate()
|
| 659 |
|
| 660 |
+
outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
|
| 661 |
+
outputs = [output for output in outputs.values()]
|
| 662 |
+
else: # PYCUDA
|
| 663 |
+
import pycuda.driver as cuda
|
| 664 |
+
# Set the input shape dynamically
|
| 665 |
+
input_shape = input.shape
|
| 666 |
+
self.context.set_binding_shape(0, input_shape)
|
| 667 |
+
|
| 668 |
+
# Ensure input_data matches the expected shape
|
| 669 |
+
np.copyto(self.inputs[0]['host'], input.ravel())
|
| 670 |
+
cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
|
| 671 |
+
|
| 672 |
+
# Run inference
|
| 673 |
+
self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
|
| 674 |
+
|
| 675 |
+
# Transfer predictions back from the GPU
|
| 676 |
+
for output in self.outputs:
|
| 677 |
+
cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
|
| 678 |
+
|
| 679 |
+
# Synchronize the stream
|
| 680 |
+
self.stream.synchronize()
|
| 681 |
+
|
| 682 |
+
# Return only the output values (in their original shapes)
|
| 683 |
+
outputs = [out['host'].reshape(out['shape']) for out in self.outputs]
|
| 684 |
|
| 685 |
return outputs
|
| 686 |
|
|
|
|
| 755 |
std: tuple = None,
|
| 756 |
device: str = 'cuda',
|
| 757 |
is_yolo_nas_pose = False,
|
| 758 |
+
plugin_path = 'libmmdeploy_tensorrt_ops.so',
|
| 759 |
batch_size: int = 1):
|
| 760 |
super().__init__(model,
|
| 761 |
mean,
|
| 762 |
std,
|
| 763 |
device,
|
| 764 |
+
is_yolo_nas_pose,
|
| 765 |
+
batch_size,
|
| 766 |
+
plugin_path)
|
| 767 |
|
|
|
|
| 768 |
self.in_queues = dict()
|
| 769 |
self.out_queues = dict()
|
| 770 |
self.buffers = dict()
|