Spaces:

vu0018
/

MultipleHumanPose

Sleeping

App Files Community

vu0018 committed on Oct 1

Commit

c74bf35

verified ·

1 Parent(s): c5e8001

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -19

app.py CHANGED Viewed

@@ -7,6 +7,11 @@ import tempfile
 from transformers import pipeline
 from PIL import Image
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
@@ -17,10 +22,6 @@ action_model = pipeline(
 )
 def detect_pose_and_activity(video_file):
-    """
-    Process the uploaded video to detect human poses and classify activity.
-    Video is limited to 10 seconds. Returns annotated video and predicted action.
-    """
     try:
         # Save uploaded video temporarily
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
@@ -36,30 +37,48 @@ def detect_pose_and_activity(video_file):
             fps = 30  # fallback
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        max_frames = int(min(total_frames/fps, 10) * fps)  # limit 10s
         output_frames = []
         action_predictions = []
-        # Process frames
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             for _ in range(max_frames):
                 ret, frame = cap.read()
                 if not ret:
                     break
-                # Pose detection
-                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                results = pose.process(image_rgb)
-                if results.pose_landmarks:
-                    mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
                 output_frames.append(frame)
-                # Convert frame to PIL image for Hugging Face model
-                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-                pred = action_model(pil_image)
-                action_predictions.append(pred[0]['label'])
         cap.release()
@@ -67,7 +86,7 @@ def detect_pose_and_activity(video_file):
             return None, "Error: No frames to process."
         # Take the most frequent predicted action
-        action_label = max(set(action_predictions), key=action_predictions.count)
         # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -87,8 +106,8 @@ iface = gr.Interface(
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
-    title="Human Pose & Activity Recognition",
-    description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running)."
 )
-iface.launch()

 from transformers import pipeline
 from PIL import Image
+# Load YOLOv5 model from torch hub
+yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
+yolo_model.conf = 0.4  # confidence threshold
+yolo_model.classes = [0]  # only detect persons (class 0)
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
 )
 def detect_pose_and_activity(video_file):
     try:
         # Save uploaded video temporarily
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             fps = 30  # fallback
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        max_frames = int(min(total_frames / fps, 10) * fps)  # limit 10s
         output_frames = []
         action_predictions = []
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             for _ in range(max_frames):
                 ret, frame = cap.read()
                 if not ret:
                     break
+                # Detect people using YOLOv5
+                results = yolo_model(frame)
+                detections = results.xyxy[0].cpu().numpy()
+                frame_actions = []
+                for det in detections:
+                    x1, y1, x2, y2, conf, cls = map(int, det[:6])
+                    person_crop = frame[y1:y2, x1:x2]
+                    # Pose estimation on cropped person
+                    person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
+                    pose_result = pose.process(person_rgb)
+                    if pose_result.pose_landmarks:
+                        mp.solutions.drawing_utils.draw_landmarks(
+                            person_crop, pose_result.pose_landmarks, mp_pose.POSE_CONNECTIONS
+                        )
+                    # Action recognition
+                    pil_image = Image.fromarray(cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB))
+                    pred = action_model(pil_image)
+                    frame_actions.append(pred[0]['label'])
+                    # Draw bounding box
+                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                 output_frames.append(frame)
+                if frame_actions:
+                    action_predictions.append(max(set(frame_actions), key=frame_actions.count))
         cap.release()
             return None, "Error: No frames to process."
         # Take the most frequent predicted action
+        action_label = max(set(action_predictions), key=action_predictions.count) if action_predictions else "Unknown"
         # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
+    title="Multi-Person Pose & Activity Recognition",
+    description="Upload a short video (max 10s). The app detects multiple people, estimates their poses, and predicts their actions."
 )
+iface.launch()