File size: 3,298 Bytes
5322ae1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Preprocessing utilities for MediaPipe landmark extraction
"""
import cv2
import numpy as np
import mediapipe as mp
from typing import Optional, Tuple


# Module-level shortcuts into the MediaPipe "solutions" namespace.
# These lines only bind names; no Holistic model instance is created here.
# Neither alias is referenced by the functions visible in this file; they are
# presumably kept for modules that import them -- confirm before removing.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils


def mediapipe_detection(image: np.ndarray, model) -> Tuple[np.ndarray, object]:
    """
    Run a MediaPipe model on a single BGR frame.

    The frame is converted to RGB before it is handed to ``model.process``
    and converted back to BGR afterwards. The read-only flag is toggled on
    the converted RGB array, not on the array passed in by the caller.

    Args:
        image: Input frame (BGR format)
        model: Object exposing ``process(rgb_image)``, e.g. a MediaPipe
            Holistic model instance

    Returns:
        Tuple of (frame after the BGR -> RGB -> BGR round trip,
        detection results returned by ``model.process``)
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Keep the RGB frame read-only while the model runs, then restore it.
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detection


def extract_keypoints(results) -> np.ndarray:
    """
    Flatten MediaPipe detection results into one feature vector.

    Layout of the returned vector (in this order):
    - Pose: 33 points × (x, y, z, visibility) = 132 features
    - Left hand: 21 points × (x, y, z) = 63 features
    - Right hand: 21 points × (x, y, z) = 63 features
    Total: 258 features

    Any group whose landmarks are missing (falsy) is filled with zeros so
    the positions of the other groups do not shift.

    Args:
        results: MediaPipe detection results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``

    Returns:
        Flattened array of 258 features
    """
    def _flatten_group(landmark_list, fields, point_count):
        # Missing detection -> zero block of the size the group would have.
        if not landmark_list:
            return np.zeros(point_count * len(fields))
        rows = [
            [getattr(point, field) for field in fields]
            for point in landmark_list.landmark
        ]
        return np.array(rows).flatten()

    hand_fields = ("x", "y", "z")
    pose_fields = hand_fields + ("visibility",)

    pose_block = _flatten_group(results.pose_landmarks, pose_fields, 33)
    left_block = _flatten_group(results.left_hand_landmarks, hand_fields, 21)
    right_block = _flatten_group(results.right_hand_landmarks, hand_fields, 21)

    return np.concatenate([pose_block, left_block, right_block])


def process_frame(frame: np.ndarray, holistic_model) -> Optional[np.ndarray]:
    """
    Run detection on one frame and return its keypoints when a hand is visible.

    Args:
        frame: Input frame (BGR format)
        holistic_model: MediaPipe Holistic model instance

    Returns:
        Keypoints array (258 features), or None when neither the left nor
        the right hand was detected
    """
    _frame, detection = mediapipe_detection(frame, holistic_model)

    # Frames without any detected hand are skipped entirely.
    hand_found = detection.left_hand_landmarks or detection.right_hand_landmarks
    if not hand_found:
        return None

    return extract_keypoints(detection)


def decode_base64_image(base64_string: str) -> Optional[np.ndarray]:
    """
    Decode a base64 string (optionally a data URL) into an image array.

    Args:
        base64_string: Base64 encoded image, with or without a
            ``data:<mime>;base64,`` style prefix

    Returns:
        Decoded image as numpy array (loaded with ``cv2.IMREAD_COLOR``), or
        None when the payload is empty or ``cv2.imdecode`` cannot decode it

    Raises:
        binascii.Error: If the payload is not valid base64 (propagated from
            ``base64.b64decode``)
    """
    import base64

    # Remove data URL prefix if present. A comma is not part of the base64
    # alphabet, so its presence signals a prefix rather than payload.
    if ',' in base64_string:
        base64_string = base64_string.split(',')[1]

    # Decode base64 to bytes
    img_bytes = base64.b64decode(base64_string)

    # An empty payload cannot hold an image. Handing an empty buffer to
    # cv2.imdecode fails with an opaque OpenCV assertion, so report it the
    # same way an undecodable image is reported: None.
    if not img_bytes:
        return None

    # Convert bytes to numpy array
    nparr = np.frombuffer(img_bytes, np.uint8)

    # Decode image (cv2.imdecode yields None for data it cannot decode)
    return cv2.imdecode(nparr, cv2.IMREAD_COLOR)