# NOTE(review): the original first lines were "Spaces:" / "Sleeping" / "Sleeping" —
# page-header residue from a Hugging Face Spaces scrape, not part of the program.
# Standard library
import os
import sys

# Third-party
import cv2
import imageio.v2 as imageio
import numpy as np
import requests
import supervision as sv
import torch
from PIL import Image
from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation, infer_device

# Local
from vit_pose.vertex_annotator_heart import VertexAnnotatorHeart
from vit_pose.vertex_annotator_star import VertexAnnotatorStar

# Pick the best available accelerator (CUDA / MPS / CPU).
device = infer_device()

# Stage 1 model: RT-DETR person detector (COCO + Objects365 checkpoint).
person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)

# Stage 2 model: ViTPose keypoint estimator, run on each detected person box.
image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)
def vit_pose_estimation(image, frame_count: int):
    """Detect people in ``image`` and draw their pose skeletons.

    Two-stage pipeline: RT-DETR finds person bounding boxes, then ViTPose
    estimates keypoints inside each box. Keypoint edges are drawn in light
    grey and vertices as pink hearts.

    Args:
        image: input frame; assumed PIL-like (uses ``.height``, ``.width``
            and ``.copy()``) — TODO confirm against callers.
        frame_count: index of the frame being processed. Currently unused;
            kept so the existing call sites remain valid.

    Returns:
        The annotated frame, or an unmodified copy of ``image`` when no
        person is detected.
    """
    # --- Stage 1: person detection ---
    inputs = person_image_processor(images=image, return_tensors="pt").to(person_model.device)
    with torch.no_grad():
        outputs = person_model(**inputs)
    results = person_image_processor.post_process_object_detection(
        outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
    )
    result = results[0]

    # Label 0 is "person" in the COCO label space of this checkpoint.
    person_boxes = result["boxes"][result["labels"] == 0]
    person_boxes = person_boxes.cpu().numpy()

    # Guard: with zero detections, the ViTPose processor would be fed an
    # empty box list and torch.stack([]) below would raise RuntimeError.
    if len(person_boxes) == 0:
        return image.copy()

    # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format,
    # which is what the ViTPose processor expects.
    person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
    person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

    # --- Stage 2: per-person keypoint estimation ---
    inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
    image_pose_result = pose_results[0]

    # Stack per-person results into (num_people, num_keypoints, ...) arrays.
    xy = torch.stack([pose_result["keypoints"] for pose_result in image_pose_result]).cpu().numpy()
    scores = torch.stack([pose_result["scores"] for pose_result in image_pose_result]).cpu().numpy()

    # --- Stage 3: annotation ---
    color_edge_annotator = sv.Color.from_hex("#e1e1e1")
    color_vertex_annotator = sv.Color.from_hex('#ffc0cb')
    key_points = sv.KeyPoints(
        xy=xy, confidence=scores
    )
    edge_annotator = sv.EdgeAnnotator(
        color=color_edge_annotator,
        thickness=1
    )
    vertex_annotator = VertexAnnotatorHeart(
        color=color_vertex_annotator,
        radius=10
    )
    annotated_frame = edge_annotator.annotate(
        scene=image.copy(),
        key_points=key_points
    )
    annotated_frame = vertex_annotator.annotate(
        scene=annotated_frame,
        key_points=key_points
    )
    return annotated_frame