import tempfile

# NOTE(review): cv2, mediapipe (as mp), and gradio (as gr) are used below but
# never imported here — presumably re-exported by micro_gestures' star import.
# TODO: confirm, and prefer explicit imports over `import *`.
from micro_gestures import *
from composite_gestures import detect_composite_gesture

# MediaPipe module handles.
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# BUG FIX(review): the Hands(...) constructor call was truncated in the
# original source (only the dangling `min_tracking_confidence=0.5)` fragment
# survived). Reconstructed with MediaPipe's standard video-mode defaults.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)


def process_video(video_path, target_width=640):
    """Run MediaPipe hand detection over a video and annotate each frame.

    Every frame is resized to ``target_width`` (aspect ratio preserved),
    hand landmarks are drawn on it, per-frame micro-gestures are classified
    and buffered, and composite gestures are detected from the buffer.

    Args:
        video_path: Path to the input video file.
        target_width: Width in pixels to which frames are scaled before
            processing (smaller = faster).

    Returns:
        Tuple of (path to the processed .mp4 file, last detected composite
        gesture label, or "None" if nothing was detected).
    """
    cap = cv2.VideoCapture(video_path)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    # BUG FIX: close the handle so cv2.VideoWriter can reopen the path by
    # name (required on Windows; harmless elsewhere). The file persists
    # because delete=False.
    temp_output.close()

    out = None                  # created lazily once the frame size is known
    sequence_buffer = []        # rolling window of recent micro-gesture labels
    last_label = "None"         # last composite gesture detected

    while cap.isOpened():
        ret, frame = cap.read()
        # BUG FIX: original broke out of the loop unconditionally after the
        # first read; only stop when the stream is exhausted.
        if not ret:
            break

        # Resize, preserving aspect ratio.
        h0, w0 = frame.shape[:2]
        scale = target_width / float(w0)
        target_height = int(round(h0 * scale))
        frame_small = cv2.resize(
            frame, (target_width, target_height),
            interpolation=cv2.INTER_AREA,
        )

        # MediaPipe expects RGB; OpenCV delivers BGR.
        rgb_small = cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)
        hand_results = hands.process(rgb_small)

        micro_label = ""
        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame_small, hand_landmarks, mp_hands.HAND_CONNECTIONS
                )
                landmarks = [(lm.x, lm.y, lm.z)
                             for lm in hand_landmarks.landmark]
                if fist_closed(landmarks):
                    micro_label = "fist_closed"
                elif palm_open(landmarks):
                    # NOTE(review): palm_open() maps to "index_up" in the
                    # original — looks like a copy/paste mismatch, but
                    # detect_composite_gesture may depend on this exact
                    # label; confirm before renaming.
                    micro_label = "index_up"
                elif thumb_up(landmarks):
                    micro_label = "thumb_up"

        if micro_label:
            sequence_buffer.append(micro_label)
            if len(sequence_buffer) > 5:
                sequence_buffer.pop(0)  # keep only the 5 most recent labels

        # Detect a composite gesture from the micro-gesture sequence.
        if sequence_buffer:
            detected = detect_composite_gesture(sequence_buffer)
            if detected:
                last_label = detected

        # Lazily initialize the writer once the output frame size is known.
        if out is None:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # BUG FIX(review): the original fps-fallback branch and the
            # VideoWriter construction were truncated; reconstructed with a
            # 30 fps fallback for missing/bogus metadata.
            if fps <= 0 or fps > 120:
                fps = 30.0
            out = cv2.VideoWriter(
                temp_output.name, fourcc, fps,
                (frame_small.shape[1], frame_small.shape[0]),
            )

        out.write(frame_small)

    cap.release()
    if out:
        out.release()

    return temp_output.name, last_label
# Gradio interface: wires process_video to a video upload + width slider,
# and shows the annotated video plus the detected gesture label.
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(label="Upload or Record Video"),
        gr.Slider(minimum=160, maximum=1280, value=640, step=20,
                  label="Processing Width"),
    ],
    outputs=[
        gr.Video(label="Processed Video with Landmarks"),
        gr.Textbox(label="Detected Gesture", interactive=False),
    ],
    title="Hand & Body Pose Detection",
    description="Upload or record a video, see MediaPipe detect hand "
                "landmarks. Gesture label is shown below the video.",
)

if __name__ == "__main__":
    # BUG FIX: the original __main__ guard had no body (syntax error / no-op);
    # launch the Gradio app when run as a script.
    iface.launch()