Update app.py

app.py CHANGED
@@ -1,31 +1,32 @@
 import gradio as gr
 import cv2
-import mediapipe as mp
 import torch
 import numpy as np
 import tempfile
+from transformers import pipeline
+from PIL import Image
+import requests
+import mediapipe as mp
 
 # Initialize MediaPipe Pose
 mp_pose = mp.solutions.pose
 
-#
-
-
-        super().__init__()
-        self.fc = torch.nn.Sequential(
-            torch.nn.Linear(input_size, 64),
-            torch.nn.ReLU(),
-            torch.nn.Linear(64, num_classes)
-        )
-
-    def forward(self, x):
-        return self.fc(x)
+# Load Hugging Face models
+action_model = pipeline("image-classification", model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224")
+pose_model = torch.hub.load("facebookresearch/ViTPose", "vitpose", pretrained=True)
 
-#
-
-
+# Define action labels
+action_labels = [
+    "calling", "clapping", "cycling", "dancing", "drinking", "eating", "fighting", "hugging",
+    "laughing", "listening_to_music", "running", "sitting", "sleeping", "texting", "using_laptop"
+]
 
 def detect_pose_and_activity(video_file):
+    """
+    Process the uploaded video to detect human poses and classify the activity.
+    Video is trimmed to 10 seconds if longer.
+    Returns the annotated video and predicted activity label.
+    """
     try:
         # Save uploaded video to a temporary file
         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
@@ -33,17 +34,21 @@ def detect_pose_and_activity(video_file):
         temp_video.close()
 
         cap = cv2.VideoCapture(temp_video.name)
+        if not cap.isOpened():
+            return None, "Error: Could not open video file. Please upload a valid mp4 video."
+
         fps = cap.get(cv2.CAP_PROP_FPS)
+        if fps == 0:
+            fps = 30  # fallback if fps is zero
+
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-        max_frames = int(min(duration, 10) * fps)
+        max_frames = int(min(total_frames/fps, 10) * fps)  # limit to 10 seconds
 
         output_frames = []
         keypoints_sequence = []
 
         with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
-
-            while frame_count < max_frames:
+            for _ in range(max_frames):
                 ret, frame = cap.read()
                 if not ret:
                     break
@@ -51,30 +56,35 @@ def detect_pose_and_activity(video_file):
                 image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 results = pose.process(image_rgb)
 
+                # Extract keypoints
                 if results.pose_landmarks:
                     keypoints = []
                     for lm in results.pose_landmarks.landmark:
                         keypoints.extend([lm.x, lm.y, lm.z])
+                    if len(keypoints) != 99:
+                        keypoints = [0]*99
                     keypoints_sequence.append(keypoints)
                     mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
                 else:
-                    keypoints_sequence.append([0]
+                    keypoints_sequence.append([0]*99)
 
                 output_frames.append(frame)
-                frame_count += 1
 
         cap.release()
 
-        if
-            return None, "No
+        if len(keypoints_sequence) == 0 or len(output_frames) == 0:
+            return None, "Error: No frames or poses detected."
 
+        # Convert keypoints sequence to tensor
         keypoints_tensor = torch.tensor(keypoints_sequence, dtype=torch.float32).mean(dim=0, keepdim=True)
 
+        # Predict activity
         with torch.no_grad():
-            preds =
+            preds = pose_model(keypoints_tensor)
             action_idx = torch.argmax(preds, dim=1).item()
-            action_label =
+            action_label = action_labels[action_idx]
 
+        # Save output video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         height, width, _ = output_frames[0].shape
         out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
@@ -85,14 +95,15 @@ def detect_pose_and_activity(video_file):
         return output_file, f"Predicted Action: {action_label}"
 
     except Exception as e:
-        return None, f"Error
+        return None, f"Runtime Error: {str(e)}"
 
+# Gradio Interface
 iface = gr.Interface(
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
     title="Human Pose & Activity Recognition",
-    description="Upload a short video, and
+    description="Upload a short video (max 10s), and the app will detect human poses and predict the activity (e.g., ballet, cycling, running)."
 )
 
 iface.launch()
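A note on the new classification path: `action_model`, the image-classification pipeline loaded at the top of the file, is never actually called. Instead, `preds = pose_model(keypoints_tensor)` feeds a (1, 99) averaged-keypoint tensor to a model loaded as a pose estimator, and `torch.argmax(preds, dim=1)` assumes it returns logits over the 15 entries of `action_labels`. A minimal sketch of routing the prediction through the already-loaded pipeline instead, assuming the standard pipeline output (a list of {"label": ..., "score": ...} dicts, sorted by score) and taking the clip's middle frame as a representative sample; `classify_action` is an illustrative helper, not part of the commit:

import cv2
from PIL import Image

def classify_action(output_frames, action_model):
    # Use the middle frame as a representative sample of the clip
    frame = output_frames[len(output_frames) // 2]
    # OpenCV frames are BGR; the pipeline expects an RGB image
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    preds = action_model(image)  # e.g. [{"label": "running", "score": 0.93}, ...]
    return preds[0]["label"]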
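Separately, `cv2.VideoWriter` with the `'mp4v'` fourcc writes MPEG-4 Part 2 video, which most browsers, and therefore the `gr.Video` output component, cannot play inline. A common workaround is to re-encode the finished file to H.264 with ffmpeg. The helper below is a sketch, assuming `ffmpeg` is available in the Space image; the function name is illustrative:

import subprocess
import tempfile

def reencode_h264(src_path):
    # Re-encode to H.264 with yuv420p pixel format for browser playback
    dst_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
    subprocess.run(
        ["ffmpeg", "-y", "-i", src_path, "-vcodec", "libx264", "-pix_fmt", "yuv420p", dst_path],
        check=True,
    )
    return dst_path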