fed-vitpose / inference.py
MatthewTL's picture
Add ViTPoser inference API
f9f18ae
from transformers import (
VitPoseForPoseEstimation,
AutoProcessor,
RTDetrForObjectDetection,
)
from PIL import Image
import torch
# Pick the compute device first: CUDA when torch reports it available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Object detector: processor plus model, switched to eval mode and moved to `device`.
det_proc = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
det_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
det_model.eval()
det_model.to(device)

# Pose estimator: processor plus model, switched to eval mode and moved to `device`.
pose_proc = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
pose_model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")
pose_model.eval()
pose_model.to(device)
# Hugging Face will call this function
def predict(inputs: dict) -> dict:
    """Detect people in an image and estimate a pose for each of them.

    Args:
        inputs: Mapping with an ``"image"`` entry holding a ``PIL.Image``.

    Returns:
        ``{"poses": poses}`` where ``poses`` is the first (and only) per-image
        entry produced by ``pose_proc.post_process_pose_estimation``, or an
        empty list when the detector finds no person above the threshold.

    Raises:
        KeyError: If ``inputs`` has no ``"image"`` key.
    """
    image = inputs["image"]
    # Uploaded images may be RGBA / grayscale / palette; convert so both
    # processors receive three channels.
    # NOTE(review): assumes the checkpoints expect RGB input - confirm.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # detect people (inference only, so no autograd graph is needed)
    det_inputs = det_proc(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        det_outputs = det_model(**det_inputs)
    results = det_proc.post_process_object_detection(
        det_outputs,
        threshold=0.5,
        target_sizes=[(image.height, image.width)],
    )
    detections = results[0]

    # keep only "person" class (label 0)
    person_boxes = detections["boxes"][detections["labels"] == 0]
    # Check the row count explicitly: truthiness of a multi-element tensor
    # is ambiguous, and there is nothing to estimate without a person box.
    if person_boxes.shape[0] == 0:
        return {"poses": []}

    # Hand the pose processor a host-side NumPy array in COCO (x, y, w, h)
    # layout; the detector post-processing yields corner boxes
    # (x1, y1, x2, y2), so turn the far corner into width/height.
    person_boxes = person_boxes.detach().cpu().numpy()
    person_boxes[:, 2] -= person_boxes[:, 0]
    person_boxes[:, 3] -= person_boxes[:, 1]

    # run pose estimation
    pose_inputs = pose_proc(image, boxes=[person_boxes], return_tensors="pt").to(device)
    with torch.no_grad():
        pose_outputs = pose_model(**pose_inputs)
    poses = pose_proc.post_process_pose_estimation(pose_outputs, boxes=[person_boxes])
    return {"poses": poses[0]}