| import numpy as np |
| import cv2 as cv |
| import argparse |
|
|
| |
def opencv_python_version(str_version):
    """Parse a dotted version string like '4.10.0' into a comparable int tuple."""
    return tuple(map(int, str_version.split(".")))

# This demo relies on DNN features available from opencv-python 4.10.0 onward.
assert opencv_python_version(cv.__version__) >= opencv_python_version("4.10.0"), \
       "Please install latest opencv-python for benchmark: python3 -m pip install --upgrade opencv-python"
|
|
| from yolox import YoloX |
|
|
| |
# Valid (backend, target) combinations for cv.dnn; the list index is what the
# --backend_target CLI option selects (0 = default CPU pair).
backend_target_pairs = [
    [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA],
    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16],
    [cv.dnn.DNN_BACKEND_TIMVX, cv.dnn.DNN_TARGET_NPU],
    [cv.dnn.DNN_BACKEND_CANN, cv.dnn.DNN_TARGET_NPU]
]
|
|
# COCO class names; the tuple index matches the class id produced by the model
# (used in vis() to label each detection).
classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
           'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
           'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
           'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
           'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
           'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
           'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
           'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
           'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
           'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
           'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
           'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
           'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
           'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
|
|
def letterbox(srcimg, target_size=(640, 640)):
    """Resize srcimg to fit target_size while keeping aspect ratio.

    The image is scaled by the largest ratio that fits inside target_size
    and pasted into the top-left corner of a gray (114) float32 canvas.
    Returns (padded_image, scale) where scale maps original -> letterboxed
    coordinates.
    """
    target_h, target_w = target_size
    scale = min(target_h / srcimg.shape[0], target_w / srcimg.shape[1])
    new_h = int(srcimg.shape[0] * scale)
    new_w = int(srcimg.shape[1] * scale)

    # Gray padding value 114 matches YOLOX's training-time letterbox fill.
    canvas = np.full((target_h, target_w, 3), 114.0, dtype=np.float32)
    resized = cv.resize(srcimg, (new_w, new_h),
                        interpolation=cv.INTER_LINEAR).astype(np.float32)
    canvas[:new_h, :new_w] = resized
    return canvas, scale
|
|
def unletterbox(bbox, letterbox_scale):
    """Map box coordinates from letterboxed space back to the original image."""
    restored = bbox / letterbox_scale
    return restored
|
|
def vis(dets, srcimg, letterbox_scale, fps=None):
    """Draw detection boxes and labels (and an optional FPS counter) on a copy of srcimg.

    Each row of dets is [x0, y0, x1, y1, ..., score, class_id] in letterboxed
    coordinates; boxes are mapped back to the original image before drawing.
    Returns the annotated copy; srcimg itself is not modified.
    """
    canvas = srcimg.copy()

    if fps is not None:
        cv.putText(canvas, "FPS: %.2f" % fps, (10, 25),
                   cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    font = cv.FONT_HERSHEY_SIMPLEX
    for det in dets:
        x0, y0, x1, y1 = unletterbox(det[:4], letterbox_scale).astype(np.int32)
        score = det[-2]
        cls_id = int(det[-1])

        label = '{}:{:.1f}%'.format(classes[cls_id], score * 100)
        text_w, text_h = cv.getTextSize(label, font, 0.4, 1)[0]
        # Green box, then a filled white strip as background for the label.
        cv.rectangle(canvas, (x0, y0), (x1, y1), (0, 255, 0), 2)
        cv.rectangle(canvas, (x0, y0 + 1),
                     (x0 + text_w + 1, y0 + int(1.5 * text_h)),
                     (255, 255, 255), -1)
        cv.putText(canvas, label, (x0, y0 + text_h), font, 0.4, (0, 0, 0), thickness=1)

    return canvas
|
|
if __name__ == '__main__':
    # Fix: description previously said "Nanodet" although this demo runs YOLOX.
    parser = argparse.ArgumentParser(description='YOLOX inference using OpenCV, a contribution by Sri Siddarth Chakaravarthy as part of GSOC_2022')
    parser.add_argument('--input', '-i', type=str,
                        help='Path to the input image. Omit for using default camera.')
    parser.add_argument('--model', '-m', type=str, default='object_detection_yolox_2022nov.onnx',
                        help="Path to the model")
    parser.add_argument('--backend_target', '-bt', type=int, default=0,
                        help='''Choose one of the backend-target pair to run this demo:
                        {:d}: (default) OpenCV implementation + CPU,
                        {:d}: CUDA + GPU (CUDA),
                        {:d}: CUDA + GPU (CUDA FP16),
                        {:d}: TIM-VX + NPU,
                        {:d}: CANN + NPU
                    '''.format(*range(len(backend_target_pairs))))
    parser.add_argument('--confidence', default=0.5, type=float,
                        help='Class confidence')
    parser.add_argument('--nms', default=0.5, type=float,
                        help='Enter nms IOU threshold')
    parser.add_argument('--obj', default=0.5, type=float,
                        help='Enter object threshold')
    parser.add_argument('--save', '-s', action='store_true',
                        help='Specify to save results. This flag is invalid when using camera.')
    parser.add_argument('--vis', '-v', action='store_true',
                        help='Specify to open a window for result visualization. This flag is invalid when using camera.')
    args = parser.parse_args()

    # Resolve the requested inference backend/target pair.
    backend_id = backend_target_pairs[args.backend_target][0]
    target_id = backend_target_pairs[args.backend_target][1]

    model_net = YoloX(modelPath=args.model,
                      confThreshold=args.confidence,
                      nmsThreshold=args.nms,
                      objThreshold=args.obj,
                      backendId=backend_id,
                      targetId=target_id)

    tm = cv.TickMeter()
    tm.reset()
    if args.input is not None:
        # Single-image mode: preprocess, run inference once, report timing.
        image = cv.imread(args.input)
        input_blob = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        input_blob, letterbox_scale = letterbox(input_blob)

        tm.start()
        preds = model_net.infer(input_blob)
        tm.stop()
        print("Inference time: {:.2f} ms".format(tm.getTimeMilli()))

        img = vis(preds, image, letterbox_scale)

        if args.save:
            print('Results saved to result.jpg\n')
            cv.imwrite('result.jpg', img)

        if args.vis:
            cv.namedWindow(args.input, cv.WINDOW_AUTOSIZE)
            cv.imshow(args.input, img)
            cv.waitKey(0)
    else:
        # Camera mode: run inference per frame until any key is pressed.
        print("Press any key to stop video capture")
        deviceId = 0
        cap = cv.VideoCapture(deviceId)
        try:
            while cv.waitKey(1) < 0:
                hasFrame, frame = cap.read()
                if not hasFrame:
                    print('No frames grabbed!')
                    break

                input_blob = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
                input_blob, letterbox_scale = letterbox(input_blob)

                tm.start()
                preds = model_net.infer(input_blob)
                tm.stop()

                img = vis(preds, frame, letterbox_scale, fps=tm.getFPS())

                cv.imshow("YoloX Demo", img)

                # Reset so getFPS() reflects only the most recent frame.
                tm.reset()
        finally:
            # Fix: release the camera and close windows on exit (was missing).
            cap.release()
            cv.destroyAllWindows()
|