Luigi committed on
Commit
43a489d
·
1 Parent(s): e1ce29b

Optimization for CUDAExecutionProvider

Browse files

- Use IO binding and enable cuDNN max workspace to optimize RTMO inference
- Eliminate the dependency on rtmlib

Files changed (2) hide show
  1. rtmo_demo.py +1 -3
  2. rtmo_gpu.py +369 -44
rtmo_demo.py CHANGED
@@ -2,11 +2,10 @@
2
 
3
  import time
4
  import cv2
5
- from rtmlib import draw_skeleton
6
  from pathlib import Path
7
  import argparse
8
  import os
9
- from rtmo_gpu import RTMO_GPU
10
 
11
  if __name__ == "__main__":
12
 
@@ -52,7 +51,6 @@ if __name__ == "__main__":
52
  img_show = draw_skeleton(img_show,
53
  keypoints,
54
  scores,
55
- openpose_skeleton=False,
56
  kpt_thr=0.3,
57
  line_width=2)
58
  img_show = cv2.resize(img_show, (788, 525))
 
2
 
3
  import time
4
  import cv2
 
5
  from pathlib import Path
6
  import argparse
7
  import os
8
+ from rtmo_gpu import RTMO_GPU, draw_skeleton
9
 
10
  if __name__ == "__main__":
11
 
 
51
  img_show = draw_skeleton(img_show,
52
  keypoints,
53
  scores,
 
54
  kpt_thr=0.3,
55
  line_width=2)
56
  img_show = cv2.resize(img_show, (788, 525))
rtmo_gpu.py CHANGED
@@ -1,55 +1,380 @@
1
  import os
2
- from rtmlib import RTMO
 
 
 
3
 
4
- class RTMO_GPU(RTMO):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def __init__(self,
7
  onnx_model: str = None,
8
  model_input_size: tuple = (640, 640),
9
  mean: tuple = None,
10
  std: tuple = None,
11
- to_openpose: bool = False,
12
- backend: str = 'onnxruntime',
13
  device: str = 'cuda'):
14
 
15
- if backend == 'onnxruntime':
16
-
17
- if not os.path.exists(onnx_model):
18
- from rtmlib.tools.file import download_checkpoint
19
- onnx_model = download_checkpoint(onnx_model)
20
-
21
- import onnxruntime as ort
22
-
23
- providers = {'cpu': 'CPUExecutionProvider',
24
- 'cuda': [
25
- ('CUDAExecutionProvider', {
26
- 'device_id': 0,
27
- 'arena_extend_strategy': 'kNextPowerOfTwo',
28
- 'gpu_mem_limit': 2 * 1024 * 1024 * 1024,
29
- 'cudnn_conv_algo_search': 'DEFAULT',
30
- 'do_copy_in_default_stream': True,
31
- 'enable_cuda_graph': False
32
- }),
33
- 'CPUExecutionProvider']}
34
-
35
- self.session = ort.InferenceSession(path_or_bytes=onnx_model,
36
- providers=providers[device])
37
-
38
- print(f'load {onnx_model} with {backend} backend')
39
-
40
- self.onnx_model = onnx_model
41
- self.model_input_size = model_input_size
42
- self.mean = mean
43
- self.std = std
44
- self.backend = backend
45
- self.device = device
46
- self.to_openpose = to_openpose
47
 
48
- else:
49
- super().__init__(onnx_model,
50
- model_input_size,
51
- mean,
52
- std,
53
- to_openpose,
54
- backend,
55
- device)
 
1
  import os
2
+ import numpy as np
3
+ from typing import List, Tuple
4
+ import onnxruntime as ort
5
+ import cv2
6
 
7
# COCO 17-keypoint skeleton metadata, copied from
# https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/skeleton/coco17.py
# Keypoint colors: blue [51, 153, 255] for the face, green [0, 255, 0] for
# left-side limbs, orange [255, 128, 0] for right-side limbs.
coco17 = dict(
    name='coco17',
    keypoint_info={
        0: dict(name='nose', id=0, color=[51, 153, 255], swap=''),
        1: dict(name='left_eye', id=1, color=[51, 153, 255], swap='right_eye'),
        2: dict(name='right_eye', id=2, color=[51, 153, 255], swap='left_eye'),
        3: dict(name='left_ear', id=3, color=[51, 153, 255], swap='right_ear'),
        4: dict(name='right_ear', id=4, color=[51, 153, 255], swap='left_ear'),
        5: dict(name='left_shoulder', id=5, color=[0, 255, 0],
                swap='right_shoulder'),
        6: dict(name='right_shoulder', id=6, color=[255, 128, 0],
                swap='left_shoulder'),
        7: dict(name='left_elbow', id=7, color=[0, 255, 0],
                swap='right_elbow'),
        8: dict(name='right_elbow', id=8, color=[255, 128, 0],
                swap='left_elbow'),
        9: dict(name='left_wrist', id=9, color=[0, 255, 0],
                swap='right_wrist'),
        10: dict(name='right_wrist', id=10, color=[255, 128, 0],
                 swap='left_wrist'),
        11: dict(name='left_hip', id=11, color=[0, 255, 0], swap='right_hip'),
        12: dict(name='right_hip', id=12, color=[255, 128, 0],
                 swap='left_hip'),
        13: dict(name='left_knee', id=13, color=[0, 255, 0],
                 swap='right_knee'),
        14: dict(name='right_knee', id=14, color=[255, 128, 0],
                 swap='left_knee'),
        15: dict(name='left_ankle', id=15, color=[0, 255, 0],
                 swap='right_ankle'),
        16: dict(name='right_ankle', id=16, color=[255, 128, 0],
                 swap='left_ankle'),
    },
    skeleton_info={
        0: dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
        1: dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
        2: dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
        3: dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
        4: dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
        5: dict(link=('left_shoulder', 'left_hip'), id=5,
                color=[51, 153, 255]),
        6: dict(link=('right_shoulder', 'right_hip'), id=6,
                color=[51, 153, 255]),
        7: dict(link=('left_shoulder', 'right_shoulder'), id=7,
                color=[51, 153, 255]),
        8: dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
        9: dict(link=('right_shoulder', 'right_elbow'), id=9,
                color=[255, 128, 0]),
        10: dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
        11: dict(link=('right_elbow', 'right_wrist'), id=11,
                 color=[255, 128, 0]),
        12: dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
        13: dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
        14: dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
        15: dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
        16: dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
        17: dict(link=('left_ear', 'left_shoulder'), id=17,
                 color=[51, 153, 255]),
        18: dict(link=('right_ear', 'right_shoulder'), id=18,
                 color=[51, 153, 255]),
    })
167
 
168
# functions from https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/draw.py#L71
def draw_mmpose(img,
                keypoints,
                scores,
                keypoint_info,
                skeleton_info,
                kpt_thr=0.5,
                radius=2,
                line_width=2):
    """Draw one pose instance (keypoint circles + skeleton lines) on ``img``.

    Args:
        img (np.ndarray): Image to draw on; returned after drawing.
        keypoints (np.ndarray): (K, 2) keypoint coordinates for one instance.
        scores: Length-K iterable of per-keypoint confidence scores.
        keypoint_info (dict): Per-keypoint metadata (name/id/color).
        skeleton_info (dict): Per-link metadata (link endpoint names/color).
        kpt_thr (float): Keypoints (and links) below this score are skipped.
        radius (int): Circle radius in pixels.
        line_width (int): Line thickness in pixels.

    Returns:
        np.ndarray: The image with the instance drawn.
    """
    assert len(keypoints.shape) == 2

    visible = [score >= kpt_thr for score in scores]

    # Draw visible keypoints while also building the name -> index map that
    # the skeleton links below are expressed in.
    name_to_id = {}
    for idx, info in keypoint_info.items():
        name_to_id[info['name']] = info['id']
        if visible[idx]:
            point = keypoints[idx]
            img = cv2.circle(img, (int(point[0]), int(point[1])),
                             int(radius), tuple(info['color']), -1)

    # Draw a link only when both of its endpoints passed the threshold.
    for link_info in skeleton_info.values():
        first_name, second_name = link_info['link']
        first, second = name_to_id[first_name], name_to_id[second_name]
        if visible[first] and visible[second]:
            start = keypoints[first]
            end = keypoints[second]
            img = cv2.line(img, (int(start[0]), int(start[1])),
                           (int(end[0]), int(end[1])),
                           link_info['color'],
                           thickness=line_width)

    return img
207
+
208
# with simplification to use onnxruntime only
def draw_skeleton(img,
                  keypoints,
                  scores,
                  kpt_thr=0.5,
                  radius=2,
                  line_width=2):
    """Draw the COCO-17 skeleton for every pose instance onto ``img``.

    Args:
        img (np.ndarray): Image to draw on; returned after drawing.
        keypoints (np.ndarray): (N, K, 2) batched or (K, 2) single-instance
            keypoint coordinates.
        scores (np.ndarray): (N, K) or (K,) per-keypoint scores.
        kpt_thr (float): Minimum score for a keypoint/link to be drawn.
        radius (int): Keypoint circle radius in pixels.
        line_width (int): Skeleton line thickness in pixels.

    Returns:
        np.ndarray: The image with all instances drawn.

    Raises:
        NotImplementedError: If the keypoint count is not 17 (COCO layout).
    """
    # shape[-2] is the keypoint axis for both (N, K, 2) and (K, 2) inputs;
    # the original read shape[1], which for a single (K, 2) instance is 2 and
    # made the single-instance path unreachable.
    num_keypoints = keypoints.shape[-2]
    if num_keypoints != 17:
        raise NotImplementedError(
            f'only the 17-keypoint COCO skeleton is supported, '
            f'got {num_keypoints} keypoints')

    # Direct reference instead of the original eval() on the skeleton name.
    keypoint_info = coco17['keypoint_info']
    skeleton_info = coco17['skeleton_info']

    # Promote a single instance to a batch of one.
    if len(keypoints.shape) == 2:
        keypoints = keypoints[None, :, :]
        # scores for one instance is 1-D; the original indexed it with
        # [None, :, :], which raised IndexError.
        scores = scores[None, :]

    for instance_kpts, instance_scores in zip(keypoints, scores):
        img = draw_mmpose(img, instance_kpts, instance_scores, keypoint_info,
                          skeleton_info, kpt_thr, radius, line_width)
    return img
238
+
239
class RTMO_GPU(object):
    """ONNX Runtime wrapper for the RTMO pose-estimation model.

    Runs inference through onnxruntime with IO binding; when
    ``device='cuda'`` it prefers the CUDAExecutionProvider (with cuDNN max
    workspace enabled) and falls back to CPU.
    """

    def __init__(self,
                 onnx_model: str = None,
                 model_input_size: tuple = (640, 640),
                 mean: tuple = None,
                 std: tuple = None,
                 device: str = 'cuda'):
        """Create the onnxruntime inference session.

        Args:
            onnx_model (str): Path to the ONNX model file (or a checkpoint
                name to download when the path does not exist).
            model_input_size (tuple): (height, width) expected by the model.
            mean (tuple): Optional per-channel mean for input normalization.
            std (tuple): Optional per-channel std for input normalization.
            device (str): 'cuda' or 'cpu'.
        """
        if not os.path.exists(onnx_model):
            # Lazy import: rtmlib is only needed to download a missing
            # checkpoint; a local model file avoids the dependency entirely.
            from rtmlib.tools.file import download_checkpoint
            onnx_model = download_checkpoint(onnx_model)

        providers = {'cpu': 'CPUExecutionProvider',
                     'cuda': [
                         ('CUDAExecutionProvider', {
                             'cudnn_conv_algo_search': 'DEFAULT',
                             # Let cuDNN use as much workspace as it needs
                             # when choosing convolution algorithms
                             # (favors speed over GPU memory).
                             'cudnn_conv_use_max_workspace': True
                         }),
                         'CPUExecutionProvider']}

        self.session = ort.InferenceSession(path_or_bytes=onnx_model,
                                            providers=providers[device])

        self.onnx_model = onnx_model
        self.model_input_size = model_input_size
        # Convert normalization stats to arrays once here; the original
        # re-wrapped (and re-assigned) them on every preprocess() call.
        self.mean = np.array(mean) if mean is not None else None
        self.std = np.array(std) if std is not None else None
        self.device = device

    def preprocess(self, img: np.ndarray):
        """Letterbox-resize (and optionally normalize) one input image.

        Args:
            img (np.ndarray): Input image, HxW or HxWx3.

        Returns:
            tuple:
                - padded_img (np.ndarray): Image resized with preserved
                  aspect ratio and padded with value 114 up to
                  ``model_input_size``.
                - ratio (float): Resize ratio applied; used later to map
                  model outputs back to original-image coordinates.
        """
        if len(img.shape) == 3:
            padded_img = np.ones(
                (self.model_input_size[0], self.model_input_size[1], 3),
                dtype=np.uint8) * 114
        else:
            padded_img = np.ones(self.model_input_size, dtype=np.uint8) * 114

        ratio = min(self.model_input_size[0] / img.shape[0],
                    self.model_input_size[1] / img.shape[1])
        resized_img = cv2.resize(
            img,
            (int(img.shape[1] * ratio), int(img.shape[0] * ratio)),
            interpolation=cv2.INTER_LINEAR,
        ).astype(np.uint8)
        padded_shape = (int(img.shape[0] * ratio), int(img.shape[1] * ratio))
        padded_img[:padded_shape[0], :padded_shape[1]] = resized_img

        # Normalize only when stats were supplied (mean/std default to None).
        if self.mean is not None:
            padded_img = (padded_img - self.mean) / self.std

        return padded_img, ratio

    def postprocess(
        self,
        outputs: List[np.ndarray],
        ratio: float = 1.,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Decode RTMO model outputs into keypoints and scores.

        Args:
            outputs (List[np.ndarray]): [dets, keypoints] arrays; dets has
                shape (1, N, 5) holding xyxy + score (the ONNX graph already
                contains NMS), keypoints has shape (1, N, K, 3) holding
                xy + score.
            ratio (float): Resize ratio returned by ``preprocess``.

        Returns:
            tuple:
                - keypoints (np.ndarray): (M, K, 2) keypoints, in
                  original-image coordinates, for instances whose detection
                  score exceeds 0.3.
                - scores (np.ndarray): (M, K) per-keypoint scores.
        """
        det_outputs, pose_outputs = outputs

        # onnx contains nms module, so detections arrive pre-filtered.
        final_scores = det_outputs[0, :, 4]
        # Non-mutating division: the original did `final_boxes /= ratio`,
        # silently modifying the caller's output buffer in place.
        final_boxes = det_outputs[0, :, :4] / ratio
        keep = final_scores > 0.3

        # Decode pose outputs and map back to original-image coordinates.
        keypoints = pose_outputs[0, :, :, :2] / ratio
        scores = pose_outputs[0, :, :, 2]

        keypoints = keypoints[keep]
        scores = scores[keep]

        return keypoints, scores

    def inference(self, img: np.ndarray):
        """Run the ONNX session on one preprocessed image via IO binding.

        Args:
            img (np.ndarray): Preprocessed HxWx3 image from ``preprocess``.

        Returns:
            outputs (List[np.ndarray]): Raw [dets, keypoints] model outputs.
        """
        # Build the (1, 3, H, W) float32 input tensor.
        img = img.transpose(2, 0, 1)
        img = np.ascontiguousarray(img, dtype=np.float32)
        batch = img[None, :, :, :]  # renamed from `input` (shadowed builtin)

        # Bind input and output buffers up front so onnxruntime can avoid
        # the extra copies that a plain session.run() would perform.
        io_binding = self.session.io_binding()
        io_binding.bind_input(name='input',
                              device_type='cpu',
                              device_id=0,
                              element_type=np.float32,
                              shape=batch.shape,
                              buffer_ptr=batch.ctypes.data)
        io_binding.bind_output(name='dets')
        io_binding.bind_output(name='keypoints')

        # Run inference with IO binding.
        self.session.run_with_iobinding(io_binding)

        # Retrieve the outputs from the IO binding object.
        return [output.numpy() for output in io_binding.get_outputs()]

    def __call__(self, image: np.ndarray):
        """Full pipeline: preprocess -> inference -> postprocess.

        Args:
            image (np.ndarray): Input image of arbitrary size.

        Returns:
            tuple: (keypoints, scores) as produced by ``postprocess``.
        """
        image, ratio = self.preprocess(image)
        outputs = self.inference(image)
        return self.postprocess(outputs, ratio)
 
 
 
 
 
 
 
 
 
 
 
 
380