Spaces:

vu0018
/

my-pose-estimation-space

Sleeping

App Files Files Community

vu0018 commited on Sep 26

Commit

af17b79

verified ·

1 Parent(s): e24d81c

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -31

app.py CHANGED Viewed

@@ -1,39 +1,46 @@
 import gradio as gr
 import cv2
 import mediapipe as mp
 import tempfile
-import os
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
-def detect_pose(video_file):
-    """
-    This function takes an uploaded video file, limits it to 10 seconds,
-    applies human pose estimation using MediaPipe, and returns a new video
-    with the detected poses drawn on the frames.
-    """
     try:
         # Save uploaded video to a temporary file
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         temp_video.write(open(video_file, "rb").read())
         temp_video.close()
-        # Open video using OpenCV
         cap = cv2.VideoCapture(temp_video.name)
-        if not cap.isOpened():
-            return "Error: Could not open video file."
         fps = cap.get(cv2.CAP_PROP_FPS)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         duration = total_frames / fps
-        # Limit processing to max 10 seconds
         max_frames = int(min(duration, 10) * fps)
         output_frames = []
-        # Initialize MediaPipe Pose for pose detection
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             frame_count = 0
             while frame_count < max_frames:
@@ -41,26 +48,33 @@ def detect_pose(video_file):
                 if not ret:
                     break
-                # Convert frame to RGB for MediaPipe
                 image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 results = pose.process(image_rgb)
-                # Draw pose landmarks if detected
                 if results.pose_landmarks:
-                    mp.solutions.drawing_utils.draw_landmarks(
-                        frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
-                    )
                 output_frames.append(frame)
                 frame_count += 1
         cap.release()
-        # Check if any frames were processed
-        if len(output_frames) == 0:
-            return "Error: No frames to process."
-        # Save output video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         height, width, _ = output_frames[0].shape
         out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
@@ -68,19 +82,17 @@ def detect_pose(video_file):
             out.write(f)
         out.release()
-        return output_file
     except Exception as e:
-        # Catch any exceptions and return error message
-        return f"Error during processing: {str(e)}"
-# Gradio interface
 iface = gr.Interface(
-    fn=detect_pose,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
-    outputs=gr.Video(label="Pose Detection Output"),
-    title="Human Pose Estimation",
-    description="Upload a short video, and this app will detect human poses (max 10 seconds)."
 )
 iface.launch()

 import gradio as gr
 import cv2
 import mediapipe as mp
+import torch
+import numpy as np
 import tempfile
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
+# Dummy ST-GCN++ model (replace with actual model)
+class SimpleSTGCNPlusPlus(torch.nn.Module):
+    def __init__(self, input_size=99, num_classes=5):  # 33 keypoints x 3 coords
+        super().__init__()
+        self.fc = torch.nn.Sequential(
+            torch.nn.Linear(input_size, 64),
+            torch.nn.ReLU(),
+            torch.nn.Linear(64, num_classes)
+        )
+    def forward(self, x):
+        return self.fc(x)
+# Instantiate the model
+model = SimpleSTGCNPlusPlus()
+labels = ["Ballet Dancing", "Cycling", "Running", "Jumping", "Walking"]
+def detect_pose_and_activity(video_file):
     try:
         # Save uploaded video to a temporary file
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         temp_video.write(open(video_file, "rb").read())
         temp_video.close()
         cap = cv2.VideoCapture(temp_video.name)
         fps = cap.get(cv2.CAP_PROP_FPS)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         duration = total_frames / fps
         max_frames = int(min(duration, 10) * fps)
         output_frames = []
+        keypoints_sequence = []
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
             frame_count = 0
             while frame_count < max_frames:
                 if not ret:
                     break
                 image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 results = pose.process(image_rgb)
                 if results.pose_landmarks:
+                    keypoints = []
+                    for lm in results.pose_landmarks.landmark:
+                        keypoints.extend([lm.x, lm.y, lm.z])
+                    keypoints_sequence.append(keypoints)
+                    mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
+                else:
+                    keypoints_sequence.append([0] * 99)
                 output_frames.append(frame)
                 frame_count += 1
         cap.release()
+        if not keypoints_sequence:
+            return None, "No pose detected."
+        keypoints_tensor = torch.tensor(keypoints_sequence, dtype=torch.float32).mean(dim=0, keepdim=True)
+        with torch.no_grad():
+            preds = model(keypoints_tensor)
+            action_idx = torch.argmax(preds, dim=1).item()
+            action_label = labels[action_idx]
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         height, width, _ = output_frames[0].shape
         out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
             out.write(f)
         out.release()
+        return output_file, f"Predicted Action: {action_label}"
     except Exception as e:
+        return None, f"Error during processing: {str(e)}"
 iface = gr.Interface(
+    fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
+    outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
+    title="Human Pose & Activity Recognition",
+    description="Upload a short video, and this app will detect human poses and predict the activity (e.g., ballet, cycling)."
 )
 iface.launch()