feat: add utils for inference
utils.py ADDED
@@ -0,0 +1,190 @@
import cv2
import numpy as np
import torch
from mediapipe.python.solutions import (drawing_styles, drawing_utils,
                                        holistic, pose)
from torchvision.transforms.v2 import Compose, UniformTemporalSubsample


def draw_skeleton_on_image(
    image: np.ndarray,
    detection_results,
    resize_to: tuple[int, int] | None = None,
) -> np.ndarray:
    """
    Draw the detected skeleton on the image.

    Parameters
    ----------
    image : np.ndarray
        Image to draw the skeleton on.
    detection_results
        MediaPipe Holistic detection results.
    resize_to : tuple[int, int], optional
        Target size as (width, height), passed directly to cv2.resize.

    Returns
    -------
    np.ndarray
        Annotated image with the skeleton.
    """
    annotated_image = np.copy(image)

    # Draw pose connections.
    drawing_utils.draw_landmarks(
        annotated_image,
        detection_results.pose_landmarks,
        holistic.POSE_CONNECTIONS,
        landmark_drawing_spec=drawing_styles.get_default_pose_landmarks_style(),
    )
    # Draw left hand connections.
    drawing_utils.draw_landmarks(
        annotated_image,
        detection_results.left_hand_landmarks,
        holistic.HAND_CONNECTIONS,
        drawing_utils.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
        drawing_utils.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
    )
    # Draw right hand connections.
    drawing_utils.draw_landmarks(
        annotated_image,
        detection_results.right_hand_landmarks,
        holistic.HAND_CONNECTIONS,
        drawing_utils.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
        drawing_utils.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2),
    )

    if resize_to is not None:
        annotated_image = cv2.resize(
            annotated_image,
            resize_to,
            interpolation=cv2.INTER_AREA,
        )
    return annotated_image


def are_hands_down(pose_landmarks) -> bool:
    """
    Check whether both hands are down (each wrist below its elbow).

    Parameters
    ----------
    pose_landmarks
        MediaPipe pose landmarks, or None if no pose was detected.

    Returns
    -------
    bool
        True if both hands are down (or no pose was detected), False otherwise.
    """
    if pose_landmarks is None:
        return True

    landmarks = pose_landmarks.landmark
    # Each entry is (x, y, visibility). Shoulder visibility is used as a proxy
    # for the person being in frame, since a lowered wrist may itself fall
    # outside the frame.
    left_elbow = [
        landmarks[pose.PoseLandmark.LEFT_ELBOW.value].x,
        landmarks[pose.PoseLandmark.LEFT_ELBOW.value].y,
        landmarks[pose.PoseLandmark.LEFT_SHOULDER.value].visibility,
    ]
    left_wrist = [
        landmarks[pose.PoseLandmark.LEFT_WRIST.value].x,
        landmarks[pose.PoseLandmark.LEFT_WRIST.value].y,
        landmarks[pose.PoseLandmark.LEFT_SHOULDER.value].visibility,
    ]
    right_elbow = [
        landmarks[pose.PoseLandmark.RIGHT_ELBOW.value].x,
        landmarks[pose.PoseLandmark.RIGHT_ELBOW.value].y,
        landmarks[pose.PoseLandmark.RIGHT_SHOULDER.value].visibility,
    ]
    right_wrist = [
        landmarks[pose.PoseLandmark.RIGHT_WRIST.value].x,
        landmarks[pose.PoseLandmark.RIGHT_WRIST.value].y,
        landmarks[pose.PoseLandmark.RIGHT_SHOULDER.value].visibility,
    ]

    is_visible = all(
        [left_elbow[2] > 0, left_wrist[2] > 0, right_elbow[2] > 0, right_wrist[2] > 0]
    )
    # Image y grows downward, so a wrist below its elbow has the larger y.
    return is_visible and left_wrist[1] > left_elbow[1] and right_wrist[1] > right_elbow[1]


def get_predictions(
    inputs: dict,
    model,
    k: int = 3,
) -> list:
    """
    Run the model on the preprocessed inputs and return the top-k predictions
    as a list of {'label', 'score'} dicts.
    """
    if inputs is None:
        return []

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # Softmax over the full logits so the scores are class probabilities,
    # then take the top-k predictions.
    probs = torch.nn.functional.softmax(logits, dim=1)
    topk_scores, topk_indices = torch.topk(probs, k, dim=1)
    topk_scores = topk_scores.squeeze().cpu().numpy()
    topk_indices = topk_indices.squeeze().cpu().numpy()

    return [
        {
            'label': model.config.id2label[int(topk_indices[i])],
            'score': float(topk_scores[i]),
        }
        for i in range(k)
    ]


def preprocess(
    model_num_frames: int,
    keypoints_detector,
    source: str,
    data_height: int,
    data_width: int,
    model_input_height: int,
    model_input_width: int,
    device: str,
    transform: Compose,
) -> dict | None:
    """
    Read a video, render a skeleton frame per video frame, crop the signing
    segment (hands up), and build the model inputs. Returns None if fewer
    than `model_num_frames` signing frames were collected.
    """
    skeleton_video = []
    did_sample_start = False

    cap = cv2.VideoCapture(source)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect keypoints and render them on a blank canvas.
        # cv2.resize expects (width, height), hence the argument order.
        detection_results = keypoints_detector.process(frame)
        skeleton_frame = draw_skeleton_on_image(
            image=np.zeros((data_height, data_width, 3), dtype=np.uint8),
            detection_results=detection_results,
            resize_to=(model_input_width, model_input_height),
        )

        # (height, width, channels) -> (channels, height, width)
        skeleton_frame = transform(torch.tensor(skeleton_frame).permute(2, 0, 1))

        # Extract the sign segment: start sampling when the hands come up,
        # stop once they go back down.
        if not are_hands_down(detection_results.pose_landmarks):
            did_sample_start = True
        elif did_sample_start:
            break

        if did_sample_start:
            skeleton_video.append(skeleton_frame)

    cap.release()

    if len(skeleton_video) < model_num_frames:
        return None

    skeleton_video = torch.stack(skeleton_video)
    skeleton_video = UniformTemporalSubsample(model_num_frames)(skeleton_video)
    inputs = {
        'pixel_values': skeleton_video.unsqueeze(0),
    }
    inputs = {k: v.to(device) for k, v in inputs.items()}

    return inputs
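For context, here is a minimal sketch of how these utilities could be wired together for inference. The checkpoint name, video path, frame count, and frame sizes below are placeholders, not values from this commit, and the sketch assumes a Hugging Face video classifier (e.g. VideoMAE-style) whose config carries id2label.

# Hypothetical wiring of the utilities above; names and sizes are placeholders.
import mediapipe as mp
import torch
from torchvision.transforms.v2 import Compose, ToDtype
from transformers import VideoMAEForVideoClassification

from utils import get_predictions, preprocess

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = VideoMAEForVideoClassification.from_pretrained('some-org/sign-checkpoint')
model = model.to(device).eval()

# MediaPipe Holistic provides the pose and hand landmarks per frame.
detector = mp.solutions.holistic.Holistic()
# Convert uint8 skeleton frames to float tensors scaled to [0, 1].
transform = Compose([ToDtype(torch.float32, scale=True)])

inputs = preprocess(
    model_num_frames=16,          # placeholder: match the model's config
    keypoints_detector=detector,
    source='sign.mp4',            # placeholder video path
    data_height=720,
    data_width=1280,
    model_input_height=224,
    model_input_width=224,
    device=device,
    transform=transform,
)
for prediction in get_predictions(inputs, model):
    print(prediction['label'], prediction['score'])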