Spaces:

vu0018
/

MultipleHumanPose

Sleeping

File size: 2,878 Bytes

import gradio as gr
import cv2
import mediapipe as mp
import torch
import tempfile

# Load YOLOv5 model from torch hub
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, trust_repo=True)
yolo_model.conf = 0.4  # confidence threshold
yolo_model.classes = [0]  # only detect persons

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

def detect_pose(video_file):
    try:
        temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        temp_video.write(open(video_file, "rb").read())
        temp_video.close()

        cap = cv2.VideoCapture(temp_video.name)
        if not cap.isOpened():
            return None, "Error: Could not open video."

        fps = cap.get(cv2.CAP_PROP_FPS) or 30
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        max_frames = int(min(total_frames / fps, 15) * fps)  # limit to 15s

        output_frames = []

        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
            for _ in range(max_frames):
                ret, frame = cap.read()
                if not ret:
                    break

                results = yolo_model(frame)
                detections = results.xyxy[0].cpu().numpy()

                for det in detections:
                    x1, y1, x2, y2 = map(int, det[:4])
                    person_crop = frame[y1:y2, x1:x2]

                    person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                    pose_result = pose.process(person_rgb)

                    if pose_result.pose_landmarks:
                        mp_drawing.draw_landmarks(
                            person_crop, pose_result.pose_landmarks, mp_pose.POSE_CONNECTIONS
                        )

                    # Draw bounding box
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                output_frames.append(frame)

        cap.release()

        if not output_frames:
            return None, "Error: No frames processed."

        output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        height, width, _ = output_frames[0].shape
        out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
        for f in output_frames:
            out.write(f)
        out.release()

        return output_file, "Pose detection completed."

    except Exception as e:
        return None, f"Runtime Error: {str(e)}"

# Gradio Interface
iface = gr.Interface(
    fn=detect_pose,
    inputs=gr.Video(label="Upload a Video (max 10s)"),
    outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Status")],
    title="Multi-Person Pose Detection",
    description="Upload a short video (max 15s). The app detects multiple people and estimates their poses."
)

iface.launch()