fp16/int8 model got wrong answer on onnxruntime python backend

#1
by amoryou - opened

When I run the fp16 ONNX model, I get the wrong answer compared to the fp32 model. I can't figure out where it goes wrong — could you provide your inference code?

fp16 model result and run log
image
image

Could you please share the inference code you used for this model?

#!/usr/bin/env python

# -*- coding: utf-8 -*-

import copy
import time
import argparse

import cv2 as cv
import numpy as np
import onnxruntime

def run_inference(onnx_session, input_size, image):
    """Run MoveNet multipose inference on a single BGR image.

    Args:
        onnx_session: onnxruntime.InferenceSession for the MoveNet model.
        input_size: (height, width) the model input expects.
        image: BGR image array (H, W, 3) as read by cv.imread / VideoCapture.

    Returns:
        Tuple of (keypoints_list, scores_list, bbox_list), one entry per
        detected person: keypoints are [x, y] pixel coords, scores are the
        per-keypoint confidences, bboxes are [xmin, ymin, xmax, ymax, score].
    """
    image_width, image_height = image.shape[1], image.shape[0]

    # Preprocess: resize to model input, BGR->RGB, add batch dim, int32 cast.
    # NOTE(review): int32 input matches this fp32 export; an fp16/int8 export
    # may expect a different input dtype — check
    # onnx_session.get_inputs()[0].type before casting.
    input_image = cv.resize(image, dsize=(input_size[1], input_size[0]))
    input_image = cv.cvtColor(input_image, cv.COLOR_BGR2RGB)
    input_image = input_image[np.newaxis, :, :, :]
    input_image = input_image.astype('int32')

    # Inference
    input_name = onnx_session.get_inputs()[0].name
    output_name = onnx_session.get_outputs()[0].name
    outputs = onnx_session.run([output_name], {input_name: input_image})

    # Output row per person: 17 keypoints * (y, x, score) + 4 bbox + 1 score.
    keypoints_with_scores = np.squeeze(outputs[0])

    # Postprocess: scale normalized coords back to source-image pixels.
    keypoints_list, scores_list = [], []
    bbox_list = []
    for keypoints_with_score in keypoints_with_scores:
        keypoints = []
        scores = []
        # Keypoints are stored as (y, x, score) triples normalized to [0, 1].
        for index in range(17):
            keypoint_x = int(image_width *
                             keypoints_with_score[(index * 3) + 1])
            keypoint_y = int(image_height *
                             keypoints_with_score[(index * 3) + 0])
            score = keypoints_with_score[(index * 3) + 2]

            keypoints.append([keypoint_x, keypoint_y])
            scores.append(score)

        # Bounding box: indices 51-54 are (ymin, xmin, ymax, xmax), 55 score.
        bbox_ymin = int(image_height * keypoints_with_score[51])
        bbox_xmin = int(image_width * keypoints_with_score[52])
        bbox_ymax = int(image_height * keypoints_with_score[53])
        bbox_xmax = int(image_width * keypoints_with_score[54])
        bbox_score = keypoints_with_score[55]

        keypoints_list.append(keypoints)
        scores_list.append(scores)
        bbox_list.append(
            [bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, bbox_score])

    return keypoints_list, scores_list, bbox_list

def main():
    """Parse CLI args, run MoveNet inference on an image or video, and display.

    Press ESC in the display window to quit.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model",
        type=str,
        default='/Users/bailiqun/Work/h5/model_fp32_Xenova.onnx',
    )
    parser.add_argument("--input_size", type=str, default='192,256')
    parser.add_argument("--keypoint_score", type=float, default=0.1)
    parser.add_argument("--bbox_score_th", type=float, default=0.1)
    parser.add_argument("--path", type=str, default="./bus.jpg")

    args = parser.parse_args()
    model_path = args.model
    media_path = args.path
    # Model input size is given as "height,width" on the command line.
    input_size = [int(i) for i in args.input_size.split(',')]
    keypoint_score_th = args.keypoint_score
    bbox_score_th = args.bbox_score_th

    # Decide image vs. video mode from the file extension.
    cap = None
    if media_path.split('.')[-1] in ['jpg', 'jpeg', 'png', 'bmp', 'gif', 'webp']:
        frame = cv.imread(media_path)
        image_inference = True
    else:
        cap = cv.VideoCapture(media_path)
        image_inference = False

    # Load model
    onnx_session = onnxruntime.InferenceSession(
        model_path, providers=["CPUExecutionProvider"])

    while True:
        start_time = time.time()

        if not image_inference:
            ret, frame = cap.read()
            if not ret:
                break
        debug_image = copy.deepcopy(frame)

        # Inference execution
        keypoints_list, scores_list, bbox_list = run_inference(
            onnx_session,
            input_size,
            frame,
        )

        elapsed_time = time.time() - start_time

        # Draw
        debug_image = draw_debug(
            debug_image,
            elapsed_time,
            keypoint_score_th,
            keypoints_list,
            scores_list,
            bbox_score_th,
            bbox_list,
        )

        key = cv.waitKey(1)
        if key == 27:  # ESC
            break
        cv.imshow('MoveNet(multipose) Demo', debug_image)

    # BUG FIX: in image mode no VideoCapture was ever created, so the
    # unconditional cap.release() raised NameError on exit.
    if cap is not None:
        cap.release()
    cv.destroyAllWindows()

def draw_debug(
    image,
    elapsed_time,
    keypoint_score_th,
    keypoints_list,
    scores_list,
    bbox_score_th,
    bbox_list,
):
    """Render keypoints, skeleton lines, bounding boxes, and timing on a copy.

    Args:
        image: BGR image to annotate (left unmodified; a deep copy is drawn on).
        elapsed_time: inference wall time in seconds, shown in the overlay.
        keypoint_score_th: minimum score for a keypoint/limb to be drawn.
        keypoints_list: per-person list of [x, y] keypoint pixel coords.
        scores_list: per-person list of keypoint confidences.
        bbox_score_th: minimum score for a bounding box to be drawn.
        bbox_list: per-person [xmin, ymin, xmax, ymax, score].

    Returns:
        The annotated image copy.
    """
    debug_image = copy.deepcopy(image)

    # Skeleton edges as (keypoint index, keypoint index, BGR color):
    # blue-channel color for the right side, red for the left, green for torso.
    connect_list = [
        [0, 1, (255, 0, 0)],  # nose → left eye
        [0, 2, (0, 0, 255)],  # nose → right eye
        [1, 3, (255, 0, 0)],  # left eye → left ear
        [2, 4, (0, 0, 255)],  # right eye → right ear
        [0, 5, (255, 0, 0)],  # nose → left shoulder
        [0, 6, (0, 0, 255)],  # nose → right shoulder
        [5, 6, (0, 255, 0)],  # left shoulder → right shoulder
        [5, 7, (255, 0, 0)],  # left shoulder → left elbow
        [7, 9, (255, 0, 0)],  # left elbow → left wrist
        [6, 8, (0, 0, 255)],  # right shoulder → right elbow
        [8, 10, (0, 0, 255)],  # right elbow → right wrist
        [11, 12, (0, 255, 0)],  # left hip → right hip
        [5, 11, (255, 0, 0)],  # left shoulder → left hip
        [11, 13, (255, 0, 0)],  # left hip → left knee
        [13, 15, (255, 0, 0)],  # left knee → left ankle
        [6, 12, (0, 0, 255)],  # right shoulder → right hip
        [12, 14, (0, 0, 255)],  # right hip → right knee
        [14, 16, (0, 0, 255)],  # right knee → right ankle
    ]

    for keypoints, scores in zip(keypoints_list, scores_list):
        # Connect Line: only draw an edge if both endpoints pass the threshold.
        for (index01, index02, color) in connect_list:
            if scores[index01] > keypoint_score_th and scores[
                    index02] > keypoint_score_th:
                point01 = keypoints[index01]
                point02 = keypoints[index02]
                cv.line(debug_image, point01, point02, color, 2)

        # Keypoint circle
        for keypoint, score in zip(keypoints, scores):
            if score > keypoint_score_th:
                cv.circle(debug_image, keypoint, 3, (0, 255, 0), -1)

    # bounding box
    for bbox in bbox_list:
        if bbox[4] > bbox_score_th:
            cv.rectangle(debug_image, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                         (0, 255, 0), 2)

    # Inference elapsed time
    cv.putText(debug_image,
               "Elapsed Time : " + '{:.1f}'.format(elapsed_time * 1000) + "ms",
               (10, 30), cv.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2,
               cv.LINE_AA)

    return debug_image

# BUG FIX: the dunder underscores were lost when the code was pasted
# ("if name == 'main':" is a NameError at runtime).
if __name__ == '__main__':
    main()

I can run inference and draw the skeleton correctly with the FP32 model, but when using FP16 or INT8, inference runs, yet the results are incorrect.

Sign up or log in to comment