Spaces:

vu0018
/

my-pose-estimation-space

Sleeping

App Files Community

vu0018 committed on Sep 26

Commit

7f28013

verified ·

1 Parent(s): 9a13995

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -34

app.py CHANGED Viewed

@@ -19,11 +19,7 @@ action_model = pipeline(
 def detect_pose_and_activity(video_file):
     """
     Process the uploaded video to detect human poses and classify activity.
-    Optimizations:
-      - Skip frames
-      - Resize frames
-      - Batch action prediction
-    Returns annotated video and predicted action.
     """
     try:
         # Save uploaded video temporarily
@@ -37,53 +33,41 @@ def detect_pose_and_activity(video_file):
         fps = cap.get(cv2.CAP_PROP_FPS)
         if fps == 0:
-            fps = 30
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         max_frames = int(min(total_frames/fps, 10) * fps)  # limit 10s
         output_frames = []
-        pil_frames_for_model = []
-        frame_skip = 2  # process every 2nd frame
-        target_size = (224, 224)  # Resize for faster inference
-        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
-            frame_index = 0
-            while frame_index < max_frames:
                 ret, frame = cap.read()
                 if not ret:
                     break
-                # Resize frame for speed
-                frame_small = cv2.resize(frame, target_size)
-                image_rgb = cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)
-                # Pose detection on full frame
-                results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                 if results.pose_landmarks:
                     mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
                 output_frames.append(frame)
-                # Only process every Nth frame for action prediction
-                if frame_index % frame_skip == 0:
-                    pil_image = Image.fromarray(image_rgb)
-                    pil_frames_for_model.append(pil_image)
-                frame_index += 1
         cap.release()
         if len(output_frames) == 0:
             return None, "Error: No frames to process."
-        # Batch prediction
-        preds = action_model(pil_frames_for_model)
-        action_labels = [pred['label'] for pred in preds]
         # Take the most frequent predicted action
-        final_action = max(set(action_labels), key=action_labels.count)
         # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -93,7 +77,7 @@ def detect_pose_and_activity(video_file):
             out.write(f)
         out.release()
-        return output_file, f"Predicted Action: {final_action}"
     except Exception as e:
         return None, f"Runtime Error: {str(e)}"
@@ -103,8 +87,8 @@ iface = gr.Interface(
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
-    title="Human Pose & Activity Recognition (Optimized)",
-    description="Upload a short video (max 10s). The app detects human poses and predicts the activity quickly using frame skipping, resizing, and batch predictions."
 )
-iface.launch()

 def detect_pose_and_activity(video_file):
     """
     Process the uploaded video to detect human poses and classify activity.
+    Video is limited to 10 seconds. Returns annotated video and predicted action.
     """
     try:
         # Save uploaded video temporarily
         fps = cap.get(cv2.CAP_PROP_FPS)
         if fps == 0:
+            fps = 30  # fallback
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         max_frames = int(min(total_frames/fps, 10) * fps)  # limit 10s
         output_frames = []
+        action_predictions = []
+        # Process frames
+        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
+            for _ in range(max_frames):
                 ret, frame = cap.read()
                 if not ret:
                     break
+                # Pose detection
+                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                results = pose.process(image_rgb)
                 if results.pose_landmarks:
                     mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
                 output_frames.append(frame)
+                # Convert frame to PIL image for Hugging Face model
+                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                pred = action_model(pil_image)
+                action_predictions.append(pred[0]['label'])
         cap.release()
         if len(output_frames) == 0:
             return None, "Error: No frames to process."
         # Take the most frequent predicted action
+        action_label = max(set(action_predictions), key=action_predictions.count)
         # Save annotated video
         output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
             out.write(f)
         out.release()
+        return output_file, f"Predicted Action: {action_label}"
     except Exception as e:
         return None, f"Runtime Error: {str(e)}"
     fn=detect_pose_and_activity,
     inputs=gr.Video(label="Upload a Video (max 10s)"),
     outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
+    title="Human Pose & Activity Recognition",
+    description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running)."
 )
+iface.launch()