Spaces:

vu0018
/

my-pose-estimation-space

Sleeping

App Files Community

vu0018 committed on Sep 26

Commit

9a13995

verified ·

1 Parent(s): c5fba2e

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -17

app.py CHANGED Viewed

@@ -19,7 +19,11 @@ action_model = pipeline(
 def detect_pose_and_activity(video_file):
     """
     Process the uploaded video to detect human poses and classify activity.
-    Video is limited to 10 seconds. Returns annotated video and predicted action.
     """
     try:
         # Save uploaded video temporarily
@@ -33,41 +37,53 @@ def detect_pose_and_activity(video_file):
         fps = cap.get(cv2.CAP_PROP_FPS)
         if fps == 0:
-            fps = 30  # fallback
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         max_frames = int(min(total_frames/fps, 10) * fps)  # limit 10s
         output_frames = []
-        action_predictions = []
-        # Process frames
-        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
-            for _ in range(max_frames):
                 ret, frame = cap.read()
                 if not ret:
                     break
-                # Pose detection
-                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                results = pose.process(image_rgb)
                 if results.pose_landmarks:
                     mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
                 output_frames.append(frame)
-                # Convert frame to PIL image for Hugging Face model
-                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-                pred = action_model(pil_image)
-                action_predictions.append(pred[0]['label'])
         cap.release()
         if len(output_frames) == 0:
             return None, "Error: No frames to process."
         # Take the most frequent predicted action
-        action_label = max(set(action_predictions), key=action_predictions.count)
         # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -77,7 +93,7 @@ def detect_pose_and_activity(video_file):
             out.write(f)
         out.release()
-        return output_file, f"Predicted Action: {action_label}"
     except Exception as e:
         return None, f"Runtime Error: {str(e)}"
@@ -87,8 +103,8 @@ iface = gr.Interface(
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
-    title="Human Pose & Activity Recognition",
-    description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running)."
 )
 iface.launch()

 def detect_pose_and_activity(video_file):
     """
     Process the uploaded video to detect human poses and classify activity.
+    Optimizations:
+      - Skip frames
+      - Resize frames
+      - Batch action prediction
+    Returns annotated video and predicted action.
     """
     try:
         # Save uploaded video temporarily
         fps = cap.get(cv2.CAP_PROP_FPS)
         if fps == 0:
+            fps = 30
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         max_frames = int(min(total_frames/fps, 10) * fps)  # limit 10s
         output_frames = []
+        pil_frames_for_model = []
+        frame_skip = 2  # process every 2nd frame
+        target_size = (224, 224)  # Resize for faster inference
+        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
+            frame_index = 0
+            while frame_index < max_frames:
                 ret, frame = cap.read()
                 if not ret:
                     break
+                # Resize frame for speed
+                frame_small = cv2.resize(frame, target_size)
+                image_rgb = cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)
+                # Pose detection on full frame
+                results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                 if results.pose_landmarks:
                     mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
                 output_frames.append(frame)
+                # Only process every Nth frame for action prediction
+                if frame_index % frame_skip == 0:
+                    pil_image = Image.fromarray(image_rgb)
+                    pil_frames_for_model.append(pil_image)
+                frame_index += 1
         cap.release()
         if len(output_frames) == 0:
             return None, "Error: No frames to process."
+        # Batch prediction
+        preds = action_model(pil_frames_for_model)
+        action_labels = [pred['label'] for pred in preds]
         # Take the most frequent predicted action
+        final_action = max(set(action_labels), key=action_labels.count)
         # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
             out.write(f)
         out.release()
+        return output_file, f"Predicted Action: {final_action}"
     except Exception as e:
         return None, f"Runtime Error: {str(e)}"
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
+    title="Human Pose & Activity Recognition (Optimized)",
+    description="Upload a short video (max 10s). The app detects human poses and predicts the activity quickly using frame skipping, resizing, and batch predictions."
 )
 iface.launch()