import tempfile

import cv2
import gradio as gr
import mediapipe as mp

from micro_gestures import fist_closed, palm_open, thumb_up
from composite_gestures import detect_composite_gesture
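
# micro_gestures and composite_gestures are project-local modules. As a rough
# sketch of the contract assumed here (names and rules are illustrative, not
# the project's actual code), the composite detector could look like:
#
#     def detect_composite_gesture(buffer):
#         # e.g. a fist followed by an open palm reads as "release"
#         if buffer[-2:] == ["fist_closed", "palm_open"]:
#             return "release"
#         return None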

# initialize mediapipe modules
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    static_image_mode=False,       # video stream: detect once, then track
    max_num_hands=2,               # assumed for this demo
    min_detection_confidence=0.5,  # assumed to mirror the tracking threshold
    min_tracking_confidence=0.5
)

def process_video(video_path, target_width=640):
    # open video file
    cap = cv2.VideoCapture(video_path)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
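    # delete=False keeps the file on disk after this handler returns,
    # so Gradio can stream the finished video back to the client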
    temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    out = None

    sequence_buffer = []  # buffer for micro-gestures
    last_label = "None"  # store last detected gesture label

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:  # end of stream or read failure
            break

        h0, w0 = frame.shape[:2]
        # resize frame keeping aspect ratio
        scale = target_width / float(w0)
        target_height = int(round(h0 * scale))
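        # INTER_AREA is the usual choice when shrinking frames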
        frame_small = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)

        # convert to rgb
        rgb_small = cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)

        # hand detection
        hand_results = hands.process(rgb_small)
        micro_label = ""
        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame_small, hand_landmarks, mp_hands.HAND_CONNECTIONS)        
                landmarks = [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark]        
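                # label the frame with the first heuristic that matches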
                if fist_closed(landmarks):
                    micro_label = "fist_closed"
                elif palm_open(landmarks):
                    micro_label = "palm_open"
                elif thumb_up(landmarks):
                    micro_label = "thumb_up"
        
                if micro_label:
                    sequence_buffer.append(micro_label)
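                    # keep a sliding window of the 5 most recent micro-gestures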
                    if len(sequence_buffer) > 5:
                        sequence_buffer.pop(0)

        # detect composite gesture from micro-gesture sequence
        if sequence_buffer:
            detected = detect_composite_gesture(sequence_buffer)
            if detected:
                last_label = detected

        # initialize video writer
        if out is None:
            fps = cap.get(cv2.CAP_PROP_FPS)
            if fps <= 0 or fps > 120:
                fps = 25.0  # assumed fallback when FPS metadata is missing or implausible
            out = cv2.VideoWriter(
                temp_output.name, fourcc, fps,
                (frame_small.shape[1], frame_small.shape[0])
            )

        # write processed frame
        out.write(frame_small)

    cap.release()
    if out:
        out.release()

    # return video path and last detected label
    return temp_output.name, last_label


# gradio interface
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(label="Upload or Record Video"),
        gr.Slider(minimum=160, maximum=1280, value=640, step=20, label="Processing Width")
    ],
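    # inputs map positionally onto process_video(video_path, target_width)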
    outputs=[
        gr.Video(label="Processed Video with Landmarks"),
        gr.Textbox(label="Detected Gesture", interactive=False)
    ],
    title="Hand & Body Pose Detection",
    description="Upload or record a video and watch MediaPipe annotate hand landmarks. The detected gesture label appears below the video."
)

if __name__ == "__main__":
    iface.launch()