Commit c21af4b
Parent(s): 746cfcf
yolo fix
Files changed:
- backend/gradio_labanmovementanalysis/pose_estimation.py (+86 -55)
- examples/mediapipe.json (+0 -0)
- examples/movenet.json (+0 -0)
- examples/yolov11.json (+0 -0)
- examples/yolov8.json (+0 -0)
backend/gradio_labanmovementanalysis/pose_estimation.py
CHANGED
@@ -146,10 +146,10 @@ class MoveNetPoseEstimator(PoseEstimator):
                 confidence=float(score),
                 name=self.KEYPOINT_NAMES[i]
             ))
-        if keypoints:
+        if keypoints:  # MoveNet is single-pose, so only one result if any
             return [PoseResult(keypoints=keypoints, frame_index=0)]
         else:
-            return []
+            return []  # No pose detected or all keypoints were NaN
 
     def get_keypoint_names(self) -> List[str]:
         return self.KEYPOINT_NAMES.copy()
@@ -190,10 +190,10 @@ class MediaPipePoseEstimator(PoseEstimator):
             import mediapipe as mp
             self.mp_pose = mp.solutions.pose
             self.pose = self.mp_pose.Pose(
-                static_image_mode=False,
+                static_image_mode=False,  # Process video frames
                 model_complexity=self.model_complexity,
                 min_detection_confidence=self.min_detection_confidence,
-                min_tracking_confidence=0.5
+                min_tracking_confidence=0.5  # Default from MediaPipe
            )
         except ImportError:
             raise ImportError("MediaPipe required. Install with: pip install mediapipe")
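For context, a minimal sketch of how a Pose object configured as above is typically driven on a single frame. The `model_complexity=1` value, the detection threshold, and the input path are assumptions (the class passes its own attributes); MediaPipe expects RGB input while OpenCV reads BGR:

import cv2
import mediapipe as mp

pose = mp.solutions.pose.Pose(
    static_image_mode=False,       # video mode, tracks landmarks across frames
    model_complexity=1,            # assumed; the class passes self.model_complexity
    min_detection_confidence=0.5,  # assumed; the class passes its own threshold
    min_tracking_confidence=0.5,   # the value the diff pins explicitly
)
frame = cv2.imread("frame.png")    # hypothetical input, BGR order
results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
if results.pose_landmarks:
    nose = results.pose_landmarks.landmark[0]  # landmark 0 is the nose
    print(nose.x, nose.y, nose.visibility)     # normalized coords plus visibility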
@@ -219,10 +219,11 @@ class MediaPipePoseEstimator(PoseEstimator):
             keypoints.append(Keypoint(
                 x=landmark.x,
                 y=landmark.y,
-                confidence=landmark.visibility if hasattr(landmark, 'visibility') else 1.0,
+                confidence=landmark.visibility if hasattr(landmark, 'visibility') else 1.0,  # Use visibility as confidence
                 name=self.LANDMARK_NAMES[i] if i < len(self.LANDMARK_NAMES) else f"landmark_{i}"
             ))
 
+        # MediaPipe Pose API typically returns one pose per image in this configuration
         return [PoseResult(keypoints=keypoints, frame_index=0)]
 
     def get_keypoint_names(self) -> List[str]:
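Because visibility is reused as confidence here, downstream code can filter MediaPipe and YOLO keypoints uniformly. A hedged sketch (the import path is assumed from the repo layout, and the 0.5 threshold is arbitrary):

from typing import List

# Import path assumed from backend/gradio_labanmovementanalysis/pose_estimation.py
from gradio_labanmovementanalysis.pose_estimation import Keypoint, PoseResult

def visible_joints(pose: PoseResult, min_conf: float = 0.5) -> List[Keypoint]:
    # For MediaPipe poses, confidence is the landmark visibility score in [0, 1]
    return [kp for kp in pose.keypoints if kp.confidence >= min_conf]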
@@ -250,20 +251,22 @@ class YOLOPoseEstimator(PoseEstimator):
         Initialize YOLO pose model.
 
         Args:
-            model_version: "v8" or "v11"
+            model_version: "v8" or "v11" (Note: v11 is hypothetical here as Ultralytics primarily focuses on v8, v9, etc.)
             model_size: Model size - "n" (nano), "s" (small), "m" (medium), "l" (large), "x" (xlarge)
-            confidence_threshold: Minimum confidence for detections
+            confidence_threshold: Minimum confidence for person detections (not individual keypoints)
         """
-        self.model_version = model_version
-        self.model_size = model_size
-        self.confidence_threshold = confidence_threshold
+        self.model_version = model_version.lower()
+        self.model_size = model_size.lower()
+        self.confidence_threshold = confidence_threshold  # This is for the main object detection
         self.model = None
 
         # Determine model path
-        if model_version == "v8":
-            self.model_path = f"yolov8{model_size}-pose.pt"
-        else:
-            self.model_path = f"yolov11{model_size}-pose.pt"
+        if self.model_version == "v8":
+            self.model_path = f"yolov8{self.model_size}-pose.pt"
+        elif self.model_version == "v11":  # Assuming v11 follows a similar naming, adjust if official names differ
+            self.model_path = f"yolov11{self.model_size}-pose.pt"  # This might be a placeholder if v11 isn't standard Ultralytics
+        else:
+            raise ValueError(f"Unsupported YOLO version: {model_version}")
 
         self._load_model()
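A quick sketch of the weight filenames the constructor above resolves, one per (version, size) pair. The yolov11 names mirror the code's own placeholder naming and may not match official Ultralytics releases:

# Weight filenames built by the constructor, per (version, size).
# "v11" naming follows the code's assumption, not a confirmed release scheme.
for version, prefix in [("v8", "yolov8"), ("v11", "yolov11")]:
    for size in ["n", "s", "m", "l", "x"]:
        print(f"{version}-{size}: {prefix}{size}-pose.pt")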
@@ -282,34 +285,53 @@ class YOLOPoseEstimator(PoseEstimator):
             self._load_model()
 
         # Run inference
-        results = self.model(frame, conf=self.confidence_threshold)
+        # conf is for person detection; keypoint confidences are separate
+        results = self.model(frame, conf=self.confidence_threshold, iou=0.7)
 
         pose_results = []
+        height, width = frame.shape[:2]
 
-        # Process each detection
-        for r in results:
-            if r.keypoints is not None:
-                keypoints_data = r.keypoints.data[0].cpu().tolist()
-
-                keypoints = []
-                for i, (x, y, conf) in enumerate(keypoints_data):
-                    # Sanitize NaN values
-                    if any(map(math.isnan, [x, y, conf])):
-                        continue
-                    keypoints.append(Keypoint(
-                        x=float(x),
-                        y=float(y),
-                        confidence=float(conf),
-                        name=self.KEYPOINT_NAMES[i] if i < len(self.KEYPOINT_NAMES) else f"joint_{i}"
-                    ))
-
-                if keypoints:
-                    pose_results.append(PoseResult(
-                        keypoints=keypoints,
-                        frame_index=0,
-                        person_id=0
-                    ))
+        # Process each detection result (Ultralytics Results object)
+        for r_idx, r in enumerate(results):
+            if r.keypoints is not None and hasattr(r.keypoints, 'data'):
+                # r.keypoints.data is a tensor of shape (num_persons, num_keypoints, 3)
+                # The last dimension is [x_pixel, y_pixel, confidence_keypoint]
+                for person_idx, keypoints_data_tensor in enumerate(r.keypoints.data):
+                    keypoints_list_for_person = keypoints_data_tensor.cpu().tolist()  # Convert tensor to list
+
+                    keypoints = []
+                    for i, (x_pixel, y_pixel, kp_conf) in enumerate(keypoints_list_for_person):
+                        # Sanitize NaN values
+                        if any(map(math.isnan, [x_pixel, y_pixel, kp_conf])):
+                            continue
+
+                        current_confidence = float(kp_conf)
+
+                        # According to Ultralytics/COCO, missing keypoints are often (0,0) with conf 0.
+                        # If (0,0) pixel coords are returned with non-zero confidence by the model,
+                        # it might be an artifact or a misinterpretation.
+                        # We will reduce confidence for (0,0) pixel points if their original confidence isn't extremely high,
+                        # to help filter them in downstream tasks (visualization, analysis).
+                        if float(x_pixel) == 0.0 and float(y_pixel) == 0.0 and current_confidence < 0.9:
+                            # Threshold 0.9 is arbitrary, means "only trust (0,0) if model is super sure"
+                            current_confidence = 0.0
+
+                        keypoints.append(Keypoint(
+                            x=float(x_pixel) / width if width > 0 else 0.0,  # Normalize
+                            y=float(y_pixel) / height if height > 0 else 0.0,  # Normalize
+                            confidence=current_confidence,
+                            name=self.KEYPOINT_NAMES[i] if i < len(self.KEYPOINT_NAMES) else f"joint_{i}"
+                        ))
+
+                    if keypoints:
+                        # Create a unique person ID if not available from tracker (e.g. r.boxes.id)
+                        # For simplicity, using r_idx (result index) and person_idx (index within this result)
+                        # This might not be persistent across frames without a tracker.
+                        unique_person_id = person_idx  # Or a more robust ID if tracking is used
+                        pose_results.append(PoseResult(
+                            keypoints=keypoints,
+                            frame_index=0,  # Will be updated by detect_batch
+                            person_id=unique_person_id
+                        ))
 
         return pose_results
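The sanitization in this hunk does three things per keypoint: drop NaNs, zero out suspicious (0,0) detections, and normalize pixel coordinates by frame size. A standalone sketch of that logic, assuming (x, y, conf) triples in pixel space (the function name is illustrative, not from the commit):

import math
from typing import Optional, Tuple

def sanitize_keypoint(x: float, y: float, conf: float,
                      width: int, height: int) -> Optional[Tuple[float, float, float]]:
    """Mirror of the per-keypoint logic in YOLOPoseEstimator.detect:
    returns (x_norm, y_norm, confidence) or None if the keypoint is unusable."""
    if any(map(math.isnan, [x, y, conf])):
        return None  # Drop NaN keypoints entirely
    if x == 0.0 and y == 0.0 and conf < 0.9:
        conf = 0.0  # (0,0) usually marks a missing keypoint; distrust it
    x_norm = x / width if width > 0 else 0.0
    y_norm = y / height if height > 0 else 0.0
    return x_norm, y_norm, float(conf)

# A NaN keypoint is dropped; a low-confidence (0,0) one is kept but zeroed out.
assert sanitize_keypoint(float("nan"), 10.0, 0.8, 640, 480) is None
assert sanitize_keypoint(0.0, 0.0, 0.3, 640, 480) == (0.0, 0.0, 0.0)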
@@ -376,18 +398,42 @@ def get_pose_estimator(model_spec: str) -> PoseEstimator:
     # MoveNet variants
     elif model_spec.startswith("movenet"):
         variant = "lightning" if "lightning" in model_spec else "thunder"
+        if "lightning" not in model_spec and "thunder" not in model_spec:  # e.g. "movenet"
+            variant = "lightning"  # Default MoveNet to lightning
         return create_pose_estimator("movenet", model_variant=variant)
 
     # YOLO variants
     elif model_spec.startswith("yolo"):
         parts = model_spec.split("-")
-        version = parts[1] if len(parts) > 1 else "v8"
-        size = parts[2] if len(parts) > 2 else "n"
+        # yolo-v8-n -> parts = ["yolo", "v8", "n"]
+        # yolo -> parts = ["yolo"] -> default to v8-n
+        version = "v8"  # default version
+        size = "n"  # default size
+
+        if len(parts) > 1:  # "yolo-v8" or "yolo-v11"
+            if parts[1] in ["v8", "v11"]:  # Add other versions as needed
+                version = parts[1]
+            # If parts[1] is a size (e.g. "yolo-n"), then version remains default "v8" and size is parts[1]
+            elif parts[1] in ["n", "s", "m", "l", "x"]:
+                size = parts[1]
+
+        if len(parts) > 2:  # "yolo-v8-n"
+            if parts[2] in ["n", "s", "m", "l", "x"]:
+                size = parts[2]
+
+        # Handle case like "yolo-s" where version is implied as v8
+        if len(parts) == 2 and parts[1] in ["n", "s", "m", "l", "x"]:
+            version = "v8"  # Default to v8 if only size is specified after "yolo-"
+            size = parts[1]
+
         return create_pose_estimator("yolo", model_version=version, model_size=size)
 
-    # Legacy format support
+    # Legacy format support or direct name
     else:
-        return create_pose_estimator(model_spec)
+        try:
+            return create_pose_estimator(model_spec)
+        except ValueError:  # If model_spec isn't a direct key like "mediapipe", "movenet", "yolo"
+            raise ValueError(f"Invalid or unsupported model specification: {model_spec}")
 
 
 def _safe_pose_from_dets(dets: List[PoseResult], frame_idx: int) -> List[PoseResult]:
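Given the parsing above, the accepted spec strings resolve as follows. A standalone replica of the YOLO branch for illustration (the helper name is hypothetical; the logic mirrors the hunk):

def parse_yolo_spec(model_spec: str) -> tuple:
    """Replica of get_pose_estimator's YOLO branch, for illustration only."""
    parts = model_spec.split("-")
    version, size = "v8", "n"  # defaults
    if len(parts) > 1:
        if parts[1] in ["v8", "v11"]:
            version = parts[1]
        elif parts[1] in ["n", "s", "m", "l", "x"]:
            size = parts[1]  # version stays at the v8 default
    if len(parts) > 2 and parts[2] in ["n", "s", "m", "l", "x"]:
        size = parts[2]
    return version, size

assert parse_yolo_spec("yolo") == ("v8", "n")        # bare spec -> defaults
assert parse_yolo_spec("yolo-v11") == ("v11", "n")   # version only
assert parse_yolo_spec("yolo-s") == ("v8", "s")      # size only, version implied
assert parse_yolo_spec("yolo-v8-m") == ("v8", "m")   # full spec

Note the final len(parts) == 2 block in the diff re-assigns the same values the elif already set, so it is redundant but harmless.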
@@ -396,29 +442,14 @@ def _safe_pose_from_dets(dets: List[PoseResult], frame_idx: int) -> List[PoseResult]:
     After the loop, interpolate missing poses in pose_seq before running metrics.
     Add debug prints when a pose is missing and when interpolation is performed.
     """
+    # This function is currently not used in the provided codebase.
+    # If it were to be used, it would need proper integration.
+    print(f"[DEBUG] _safe_pose_from_dets called for frame {frame_idx}, but is not currently integrated.")
     safe_poses = []
     missing_mask = []
     prev_pose = None
 
-    for det in dets:
-        if det.frame_index == frame_idx:
-            if not det.keypoints:
-                safe_poses.append(PoseResult(keypoints=[], frame_index=frame_idx))
-                missing_mask.append(True)
-            else:
-                safe_poses.append(PoseResult(keypoints=prev_pose.keypoints, frame_index=frame_idx))
-                missing_mask.append(False)
-                prev_pose = det
-        elif det.frame_index > frame_idx:
-            break
-
-    if prev_pose is None:
-        print(f"Warning: No poses found for frame {frame_idx}")
-        safe_poses.append(PoseResult(keypoints=[], frame_index=frame_idx))
-        missing_mask.append(True)
-    else:
-        safe_poses.append(PoseResult(keypoints=prev_pose.keypoints, frame_index=frame_idx))
-        missing_mask.append(False)
-
-    return safe_poses, missing_mask
+    # This logic seems flawed for its intended purpose without further context or modification.
+    # For now, returning empty or passed 'dets' might be safer if it's not fully implemented.
+    # Returning dets as is, since the function is not used.
+    return dets, []
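The docstring promises forward-filling with debug prints, which the new body explicitly does not implement. One way the gap-filling could look, as a hedged sketch and not the commit's code (function name is hypothetical; the import path is assumed from the repo layout):

from typing import List, Tuple

# Import path assumed from backend/gradio_labanmovementanalysis/pose_estimation.py
from gradio_labanmovementanalysis.pose_estimation import PoseResult

def forward_fill_poses(dets: List[PoseResult], num_frames: int) -> Tuple[List[PoseResult], List[bool]]:
    """Sketch: produce one pose per frame, carrying the last seen pose forward.
    Returns (poses, missing_mask); missing frames reuse the previous keypoints."""
    by_frame = {d.frame_index: d for d in dets}
    poses, missing = [], []
    prev = None
    for idx in range(num_frames):
        det = by_frame.get(idx)
        if det is not None and det.keypoints:
            poses.append(det)
            missing.append(False)
            prev = det
        elif prev is not None:
            print(f"[DEBUG] frame {idx}: pose missing, forward-filling from frame {prev.frame_index}")
            poses.append(PoseResult(keypoints=prev.keypoints, frame_index=idx))
            missing.append(True)
        else:
            print(f"[DEBUG] frame {idx}: no pose seen yet, inserting empty pose")
            poses.append(PoseResult(keypoints=[], frame_index=idx))
            missing.append(True)
    return poses, missing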
examples/mediapipe.json
ADDED
The diff for this file is too large to render. See raw diff.

examples/movenet.json
ADDED
The diff for this file is too large to render. See raw diff.

examples/yolov11.json
ADDED
The diff for this file is too large to render. See raw diff.

examples/yolov8.json
ADDED
The diff for this file is too large to render. See raw diff.
|