File size: 2,708 Bytes
93277a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import torch
import requests
import numpy as np
import supervision as sv
import cv2
import os
import sys
import imageio.v2 as imageio
from PIL import Image
from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation, infer_device
from vit_pose.vertex_annotator_heart import VertexAnnotatorHeart
from vit_pose.vertex_annotator_star import VertexAnnotatorStar


# Pick the best available accelerator (CUDA/MPS/CPU) for both models.
device = infer_device()

# Person detector: RT-DETR (trained on COCO + Objects365), used to find human
# bounding boxes before pose estimation. COCO label 0 is "person".
person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)

# Pose estimator: ViTPose predicts keypoints for each detected person box.
image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)

def vit_pose_estimation(image, frame_count: int, threshold: float = 0.3):
  """Detect people in *image* and draw their pose keypoints.

  Runs RT-DETR to find person bounding boxes, then ViTPose to estimate
  keypoints for each detected person, and annotates the frame with
  light-grey skeleton edges and heart-shaped vertex markers.

  Args:
    image: PIL.Image.Image frame to process (must expose .height/.width
      and .copy()).
    frame_count: index of the current frame; currently unused, kept for
      interface compatibility with existing callers.
    threshold: confidence threshold for the person detector (default 0.3,
      matching the previous hard-coded value).

  Returns:
    A copy of *image* with pose annotations drawn on it. If no person is
    detected, an unannotated copy is returned instead of raising.
  """
  inputs = person_image_processor(images=image, return_tensors="pt").to(person_model.device)

  with torch.no_grad():
      outputs = person_model(**inputs)

  results = person_image_processor.post_process_object_detection(
      outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=threshold
  )
  result = results[0]

  # Label 0 is "person" in the COCO label set.
  person_boxes = result["boxes"][result["labels"] == 0]
  person_boxes = person_boxes.cpu().numpy()

  # Guard: with zero detections, the pose processor call and the
  # torch.stack([...]) below would raise — return the frame unannotated.
  if len(person_boxes) == 0:
      return image.copy()

  # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format,
  # which is what the ViTPose processor expects.
  person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
  person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

  inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(model.device)

  with torch.no_grad():
      outputs = model(**inputs)

  pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
  image_pose_result = pose_results[0]

  # Stack per-person keypoint coordinates and confidences into arrays
  # shaped (num_people, num_keypoints, ...).
  xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy()
  scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy()

  key_points = sv.KeyPoints(
      xy=xy, confidence=scores
  )

  # Light-grey skeleton lines; pink heart markers on the joints.
  edge_annotator = sv.EdgeAnnotator(
      color=sv.Color.from_hex("#e1e1e1"),
      thickness=1
  )
  vertex_annotator = VertexAnnotatorHeart(
      color=sv.Color.from_hex('#ffc0cb'),
      radius=10
  )
  annotated_frame = edge_annotator.annotate(
      scene=image.copy(),
      key_points=key_points
  )
  annotated_frame = vertex_annotator.annotate(
      scene=annotated_frame,
      key_points=key_points
  )

  return annotated_frame