File size: 2,664 Bytes
93277a5
 
2557bea
 
 
 
 
 
93277a5
 
 
 
 
 
 
2557bea
 
 
93277a5
2557bea
93277a5
2557bea
 
 
93277a5
 
2557bea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import torch
import supervision as sv
from transformers import (
    AutoProcessor,
    RTDetrForObjectDetection,
    VitPoseForPoseEstimation,
    infer_device,
)
from vit_pose.vertex_annotator_heart import VertexAnnotatorHeart


# Select the best available device (CUDA / MPS / CPU) via transformers' helper.
device = infer_device()

# Stage 1: person detector — RT-DETR trained on COCO + Objects365.
# NOTE: from_pretrained downloads weights on first run (network I/O).
person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
person_model = RTDetrForObjectDetection.from_pretrained(
    "PekingU/rtdetr_r50vd_coco_o365", device_map=device
)

# Stage 2: per-person keypoint estimator (ViTPose, top-down — needs boxes first).
image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
model = VitPoseForPoseEstimation.from_pretrained(
    "usyd-community/vitpose-base-simple", device_map=device
)


def vit_pose_estimation(image, frame_count: int):
    """Detect people in *image* and annotate their pose keypoints.

    Two-stage, top-down pipeline: RT-DETR finds person boxes, then ViTPose
    estimates keypoints inside each box; edges and heart-shaped vertices are
    drawn with `supervision`.

    Args:
        image: input frame; assumed PIL-like (uses `.height`, `.width`,
            `.copy()`) — TODO confirm against caller.
        frame_count: frame index; currently unused, kept for interface
            compatibility with the caller.

    Returns:
        An annotated copy of the input image. If no person is detected,
        the unannotated copy is returned.
    """
    inputs = person_image_processor(images=image, return_tensors="pt").to(
        person_model.device
    )

    with torch.no_grad():
        outputs = person_model(**inputs)

    results = person_image_processor.post_process_object_detection(
        outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
    )
    result = results[0]

    # COCO class index 0 is "person".
    person_boxes = result["boxes"][result["labels"] == 0]
    person_boxes = person_boxes.cpu().numpy()

    # Robustness: with zero detections, `torch.stack([])` below would raise —
    # bail out early and return the frame untouched.
    if len(person_boxes) == 0:
        return image.copy()

    # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format,
    # which is what the ViTPose processor expects.
    person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
    person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

    inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(
        model.device
    )

    with torch.no_grad():
        outputs = model(**inputs)

    pose_results = image_processor.post_process_pose_estimation(
        outputs, boxes=[person_boxes]
    )
    # One result list per input image; we process a single image.
    image_pose_result = pose_results[0]

    # Stack per-person keypoints/scores into (num_people, num_keypoints, ...)
    # arrays for supervision's KeyPoints container.
    xy = (
        torch.stack([pose_result["keypoints"] for pose_result in image_pose_result])
        .cpu()
        .numpy()
    )
    scores = (
        torch.stack([pose_result["scores"] for pose_result in image_pose_result])
        .cpu()
        .numpy()
    )

    color_edge_annotator = sv.Color.from_hex("#e1e1e1")

    color_vertex_annotator = sv.Color.from_hex("#ffc0cb")

    key_points = sv.KeyPoints(xy=xy, confidence=scores)

    # Draw skeleton edges first, then heart-shaped vertices on top.
    edge_annotator = sv.EdgeAnnotator(color=color_edge_annotator, thickness=1)
    vertex_annotator = VertexAnnotatorHeart(color=color_vertex_annotator, radius=10)
    annotated_frame = edge_annotator.annotate(scene=image.copy(), key_points=key_points)
    annotated_frame = vertex_annotator.annotate(
        scene=annotated_frame, key_points=key_points
    )

    return annotated_frame