Spaces:

vu0018
/

MultipleHumanPose

Sleeping

App Files Files Community

vu0018 committed on Oct 1

Commit

8cac824

verified ·

1 Parent(s): 21fd24d

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -43

app.py CHANGED Viewed

@@ -2,28 +2,19 @@ import gradio as gr
 import cv2
 import mediapipe as mp
 import torch
-import numpy as np
 import tempfile
-from transformers import pipeline
-from PIL import Image
 # Load YOLOv5 model from torch hub
 yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, trust_repo=True)
 yolo_model.conf = 0.4  # confidence threshold
-yolo_model.classes = [0]  # only detect persons (class 0)
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
-# Hugging Face pretrained model for action recognition
-action_model = pipeline(
-    "image-classification",
-    model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224"
-)
-def detect_pose_and_activity(video_file):
     try:
-        # Save uploaded video temporarily
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         temp_video.write(open(video_file, "rb").read())
         temp_video.close()
@@ -32,15 +23,11 @@ def detect_pose_and_activity(video_file):
         if not cap.isOpened():
             return None, "Error: Could not open video."
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        if fps == 0:
-            fps = 30  # fallback
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        max_frames = int(min(total_frames / fps, 10) * fps)  # limit 10s
         output_frames = []
-        action_predictions = []
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             for _ in range(max_frames):
@@ -48,47 +35,31 @@ def detect_pose_and_activity(video_file):
                 if not ret:
                     break
-                # Detect people using YOLOv5
                 results = yolo_model(frame)
                 detections = results.xyxy[0].cpu().numpy()
-                frame_actions = []
                 for det in detections:
-                    x1, y1, x2, y2, conf, cls = map(int, det[:6])
                     person_crop = frame[y1:y2, x1:x2]
-                    # Pose estimation on cropped person
                     person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                     pose_result = pose.process(person_rgb)
                     if pose_result.pose_landmarks:
-                        mp.solutions.drawing_utils.draw_landmarks(
                             person_crop, pose_result.pose_landmarks, mp_pose.POSE_CONNECTIONS
                         )
-                    # Action recognition
-                    pil_image = Image.fromarray(cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB))
-                    pred = action_model(pil_image)
-                    frame_actions.append(pred[0]['label'])
                     # Draw bounding box
                     cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                 output_frames.append(frame)
-                if frame_actions:
-                    action_predictions.append(max(set(frame_actions), key=frame_actions.count))
         cap.release()
-        if len(output_frames) == 0:
-            return None, "Error: No frames to process."
-        # Take the most frequent predicted action
-        action_label = max(set(action_predictions), key=action_predictions.count) if action_predictions else "Unknown"
-        # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         height, width, _ = output_frames[0].shape
         out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
@@ -96,18 +67,18 @@ def detect_pose_and_activity(video_file):
             out.write(f)
         out.release()
-        return output_file, f"Predicted Action: {action_label}"
     except Exception as e:
         return None, f"Runtime Error: {str(e)}"
 # Gradio Interface
 iface = gr.Interface(
-    fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
-    outputs=[gr.Video(label="Pose Multiple Detection Output"), gr.Textbox(label="Detected Pose")],
-    title="Multi-Person Pose & Activity Recognition",
-    description="Upload a short video (max 10s). The app detects multiple people, estimates their poses, and predicts their actions."
 )
 iface.launch()

 import cv2
 import mediapipe as mp
 import torch
 import tempfile
 # Load YOLOv5 model from torch hub
 yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, trust_repo=True)
 yolo_model.conf = 0.4  # confidence threshold
+yolo_model.classes = [0]  # only detect persons
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
+mp_drawing = mp.solutions.drawing_utils
+def detect_pose(video_file):
     try:
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         temp_video.write(open(video_file, "rb").read())
         temp_video.close()
         if not cap.isOpened():
             return None, "Error: Could not open video."
+        fps = cap.get(cv2.CAP_PROP_FPS) or 30
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        max_frames = int(min(total_frames / fps, 10) * fps)  # limit to 10s
         output_frames = []
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             for _ in range(max_frames):
                 if not ret:
                     break
                 results = yolo_model(frame)
                 detections = results.xyxy[0].cpu().numpy()
                 for det in detections:
+                    x1, y1, x2, y2 = map(int, det[:4])
                     person_crop = frame[y1:y2, x1:x2]
                     person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                     pose_result = pose.process(person_rgb)
                     if pose_result.pose_landmarks:
+                        mp_drawing.draw_landmarks(
                             person_crop, pose_result.pose_landmarks, mp_pose.POSE_CONNECTIONS
                         )
                     # Draw bounding box
                     cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                 output_frames.append(frame)
         cap.release()
+        if not output_frames:
+            return None, "Error: No frames processed."
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         height, width, _ = output_frames[0].shape
         out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
             out.write(f)
         out.release()
+        return output_file, "Pose detection completed."
     except Exception as e:
         return None, f"Runtime Error: {str(e)}"
 # Gradio Interface
 iface = gr.Interface(
+    fn=detect_pose,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
+    outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Status")],
+    title="Multi-Person Pose Detection",
+    description="Upload a short video (max 10s). The app detects multiple people and estimates their poses."
 )
 iface.launch()