vu0018 committed
Commit c5fba2e · verified · 1 Parent(s): 8fcf823

Update app.py

Files changed (1)
  app.py +26 -41
app.py CHANGED
@@ -1,90 +1,75 @@
 import gradio as gr
 import cv2
+import mediapipe as mp
 import torch
 import numpy as np
 import tempfile
 from transformers import pipeline
 from PIL import Image
-import requests
-import mediapipe as mp
 
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
 
-# Load Hugging Face models
-action_model = pipeline("image-classification", model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224")
-pose_model = torch.hub.load("facebookresearch/ViTPose", "vitpose", pretrained=True)
-
-# Define action labels
-action_labels = [
-    "calling", "clapping", "cycling", "dancing", "drinking", "eating", "fighting", "hugging",
-    "laughing", "listening_to_music", "running", "sitting", "sleeping", "texting", "using_laptop"
-]
+# Hugging Face pretrained model for action recognition
+action_model = pipeline(
+    "image-classification",
+    model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224"
+)
 
 def detect_pose_and_activity(video_file):
     """
-    Process the uploaded video to detect human poses and classify the activity.
-    Video is trimmed to 10 seconds if longer.
-    Returns the annotated video and predicted activity label.
+    Process the uploaded video to detect human poses and classify activity.
+    Video is limited to 10 seconds. Returns annotated video and predicted action.
     """
     try:
-        # Save uploaded video to a temporary file
+        # Save uploaded video temporarily
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         temp_video.write(open(video_file, "rb").read())
         temp_video.close()
 
         cap = cv2.VideoCapture(temp_video.name)
         if not cap.isOpened():
-            return None, "Error: Could not open video file. Please upload a valid mp4 video."
+            return None, "Error: Could not open video."
 
         fps = cap.get(cv2.CAP_PROP_FPS)
         if fps == 0:
-            fps = 30  # fallback if fps is zero
+            fps = 30  # fallback
 
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        max_frames = int(min(total_frames/fps, 10) * fps)  # limit to 10 seconds
+        max_frames = int(min(total_frames/fps, 10) * fps)  # limit 10s
 
         output_frames = []
-        keypoints_sequence = []
+        action_predictions = []
 
+        # Process frames
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             for _ in range(max_frames):
                 ret, frame = cap.read()
                 if not ret:
                     break
 
+                # Pose detection
                 image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 results = pose.process(image_rgb)
-
-                # Extract keypoints
                 if results.pose_landmarks:
-                    keypoints = []
-                    for lm in results.pose_landmarks.landmark:
-                        keypoints.extend([lm.x, lm.y, lm.z])
-                    if len(keypoints) != 99:
-                        keypoints = [0]*99
-                    keypoints_sequence.append(keypoints)
                     mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
-                else:
-                    keypoints_sequence.append([0]*99)
 
                 output_frames.append(frame)
 
-        cap.release()
+                # Convert frame to PIL image for Hugging Face model
+                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                pred = action_model(pil_image)
+                action_predictions.append(pred[0]['label'])
 
-        if len(keypoints_sequence) == 0 or len(output_frames) == 0:
-            return None, "Error: No frames or poses detected."
+        cap.release()
 
-        # Convert keypoints sequence to tensor
-        keypoints_tensor = torch.tensor(keypoints_sequence, dtype=torch.float32).mean(dim=0, keepdim=True)
+        if len(output_frames) == 0:
+            return None, "Error: No frames to process."
 
-        # Predict activity
-        with torch.no_grad():
-            preds = pose_model(keypoints_tensor)
-            action_idx = torch.argmax(preds, dim=1).item()
-            action_label = action_labels[action_idx]
+        # Take the most frequent predicted action
+        action_label = max(set(action_predictions), key=action_predictions.count)
 
-        # Save output video
+        # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         height, width, _ = output_frames[0].shape
         out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
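A note on the aggregation in the hunk above: max(set(action_predictions), key=action_predictions.count) rescans the list once per distinct label and raises ValueError on an empty list (the len(output_frames) == 0 guard happens to prevent that, since both lists grow in lockstep). A minimal equivalent sketch using collections.Counter; majority_label is a hypothetical helper, not code from this commit:

    from collections import Counter

    def majority_label(labels, default="unknown"):
        # most_common(1) returns [(label, count)] for the top label
        if not labels:
            return default  # guard: nothing was classified
        return Counter(labels).most_common(1)[0][0]

    # majority_label(["running", "running", "dancing"]) -> "running"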
@@ -103,7 +88,7 @@ iface = gr.Interface(
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
     title="Human Pose & Activity Recognition",
-    description="Upload a short video (max 10s), and the app will detect human poses and predict the activity (e.g., ballet, cycling, running)."
+    description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running)."
 )
 
 iface.launch()
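The rewritten loop runs the ViT image-classification pipeline on every frame, so a 10-second clip at 30 fps means roughly 300 forward passes. A sketch of classifying only a sample of frames instead; classify_sampled_frames and the stride of 5 are illustrative assumptions, not part of this commit:

    import cv2
    from PIL import Image

    def classify_sampled_frames(cap, action_model, max_frames, stride=5):
        # Run the classifier on every stride-th frame only;
        # action_model is the transformers pipeline defined in app.py.
        predictions = []
        for i in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break
            if i % stride == 0:
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                predictions.append(action_model(pil_image)[0]["label"])
        return predictions

With stride=5 the same clip needs about 60 pipeline calls instead of 300, and the majority vote over the sampled labels is usually unchanged.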
 
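One caveat that survives this commit: the annotated video is still written with the mp4v FourCC, which many browsers refuse to play inline, so the gr.Video output can appear blank even when the file is valid. A sketch of preferring H.264 when the local OpenCV build supports it (codec availability depends on how OpenCV was compiled; open_writer is a hypothetical helper):

    import cv2

    def open_writer(path, fps, size):
        # Try H.264 ('avc1') first for browser-friendly playback
        out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"avc1"), fps, size)
        if not out.isOpened():
            # Fall back to the mp4v codec app.py already uses
            out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, size)
        return out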