feat: add coordinate_mode arg (model/image/root_relative) to forward()
modeling_rtmw.py  CHANGED  (+83, -4)
@@ -22,9 +22,15 @@ class PoseOutput(ModelOutput):
 
     Args:
         keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
-            Predicted keypoint coordinates in format [x, y].
+            Predicted keypoint coordinates in format [x, y]. The coordinate system
+            depends on the `coordinate_mode` passed to `forward()`:
+            - ``"model"`` → raw SimCC space (model input resolution, e.g. 288×384 px)
+            - ``"image"`` → original image space, scaled via the supplied `bbox`
+            - ``"root_relative"`` → root-normalised: origin at mid-hip, unit = half the inter-hip distance
         scores (`torch.FloatTensor` of shape `(batch_size, num_keypoints)`):
-            Predicted keypoint confidence scores.
+            Predicted keypoint confidence scores in [0, 1].
+        coordinate_mode (`str`):
+            The coordinate system `keypoints` is expressed in (mirrors the argument passed to `forward()`).
         loss (`torch.FloatTensor`, *optional*):
             Loss value if training.
         pred_x (`torch.FloatTensor`, *optional*):
@@ -35,6 +41,7 @@ class PoseOutput(ModelOutput):
 
     keypoints: torch.FloatTensor = None
     scores: torch.FloatTensor = None
+    coordinate_mode: Optional[str] = None
     loss: Optional[torch.FloatTensor] = None
     pred_x: Optional[torch.FloatTensor] = None
     pred_y: Optional[torch.FloatTensor] = None
@@ -1338,6 +1345,8 @@ class RTMWModel(PreTrainedModel):
     def forward(
         self,
         pixel_values=None,
+        bbox=None,
+        coordinate_mode: str = "image",
         labels=None,
         output_hidden_states=None,
         return_dict=None,
@@ -1347,8 +1356,28 @@ class RTMWModel(PreTrainedModel):
 
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
-                Pixel values
-
+                Pixel values cropped and resized to the model's input resolution
+                (e.g. 288×384). Use `RTMWImageProcessor` or prepare manually with
+                ImageNet normalisation.
+            bbox (`torch.FloatTensor` of shape `(batch_size, 4)` or `(4,)`, *optional*):
+                Person bounding boxes in the **original** image, as
+                ``[x1, y1, x2, y2]`` pixel coordinates. Required when
+                ``coordinate_mode="image"``; ignored otherwise.
+            coordinate_mode (`str`, *optional*, defaults to ``"image"``):
+                How to express the returned keypoint coordinates:
+
+                - ``"model"`` → raw SimCC space (same resolution as the
+                  model input, e.g. 288×384 px). No extra arguments needed.
+                - ``"image"`` → rescaled back to the original image pixel
+                  space using the supplied ``bbox``. If ``bbox`` is ``None`` the
+                  output falls back to ``"model"`` space with a warning.
+                - ``"root_relative"`` → root-normalised coordinates. The root is
+                  the midpoint of the left-hip (kp 11) and right-hip (kp 12)
+                  joints. All keypoints are translated so the root is at the
+                  origin, then divided by half the inter-hip distance so that
+                  each hip lands at unit distance from the origin. The three
+                  modes are mutually exclusive; the normalisation operates on
+                  model-space coordinates.
             labels (`List[Dict]`, *optional*):
                 Labels for computing the pose estimation loss.
             output_hidden_states (`bool`, *optional*):
@@ -1361,6 +1390,7 @@ class RTMWModel(PreTrainedModel):
             If return_dict=True, `PoseOutput` is returned.
             If return_dict=False, a tuple is returned with keypoints and scores.
         """
+        import warnings
         return_dict = return_dict if return_dict is not None else True
 
         # Get inputs
@@ -1400,10 +1430,59 @@ class RTMWModel(PreTrainedModel):
             0.0, 1.0,
         )
 
+        # ── Coordinate transform ─────────────────────────────────────────────
+        # Keypoints are currently in model-input space:
+        #   x in [0, model_w), y in [0, model_h)
+        # e.g. model_w=288, model_h=384 for rtmw-l-384x288.
+        if coordinate_mode == "image":
+            if bbox is None:
+                warnings.warn(
+                    "coordinate_mode='image' requires bbox=[x1,y1,x2,y2] per image. "
+                    "Falling back to model-space coordinates.",
+                    UserWarning, stacklevel=2,
+                )
+                coordinate_mode = "model"
+            else:
+                # bbox: (B, 4) or (4,) → normalise to (B, 4)
+                bbox_t = torch.as_tensor(bbox, dtype=keypoints.dtype, device=keypoints.device)
+                if bbox_t.dim() == 1:
+                    bbox_t = bbox_t.unsqueeze(0).expand(keypoints.shape[0], -1)
+                model_h = pixel_values.shape[2]  # H dim of model input
+                model_w = pixel_values.shape[3]  # W dim of model input
+                x1 = bbox_t[:, 0:1]  # (B, 1)
+                y1 = bbox_t[:, 1:2]
+                x2 = bbox_t[:, 2:3]
+                y2 = bbox_t[:, 3:4]
+                scale_x = (x2 - x1) / model_w  # (B, 1)
+                scale_y = (y2 - y1) / model_h  # (B, 1)
+                # keypoints (B, K, 2): the (B, 1) scales broadcast over K
+                keypoints = keypoints.clone()
+                keypoints[:, :, 0] = keypoints[:, :, 0] * scale_x + x1
+                keypoints[:, :, 1] = keypoints[:, :, 1] * scale_y + y1
+
+        elif coordinate_mode == "root_relative":
+            # Root = midpoint of left_hip (11) and right_hip (12).
+            # Scale = half the inter-hip distance so each hip is at unit
+            # distance from the root. Clamp to ≥1 px to guard against
+            # degenerate detections where the hips are co-located.
+            left_hip = keypoints[:, 11, :]   # (B, 2)
+            right_hip = keypoints[:, 12, :]  # (B, 2)
+            root = 0.5 * (left_hip + right_hip)  # (B, 2)
+            scale = 0.5 * torch.norm(right_hip - left_hip, dim=-1, keepdim=True)  # (B, 1)
+            scale = scale.clamp(min=1.0)
+            keypoints = (keypoints - root.unsqueeze(1)) / scale.unsqueeze(1)
+
+        elif coordinate_mode != "model":
+            raise ValueError(
+                f"coordinate_mode must be 'model', 'image', or 'root_relative', got {coordinate_mode!r}"
+            )
+        # ─────────────────────────────────────────────────────────────────────
+
         if return_dict:
             return PoseOutput(
                 keypoints=keypoints,
                 scores=scores,
+                coordinate_mode=coordinate_mode,
                 pred_x=pred_x,
                 pred_y=pred_y
             )
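A minimal usage sketch of the new arguments (the checkpoint id and the random tensors are illustrative only; `RTMWModel` and the 384×288 input size come from the diff above):

    import torch

    # Illustrative checkpoint id; substitute a real RTMW checkpoint.
    model = RTMWModel.from_pretrained("rtmw-l-384x288")
    model.eval()

    pixel_values = torch.randn(1, 3, 384, 288)          # person crop at model input size (H=384, W=288)
    bbox = torch.tensor([[120.0, 80.0, 420.0, 560.0]])  # [x1, y1, x2, y2] in the original image

    with torch.no_grad():
        # keypoints mapped back to original-image pixels via bbox
        out_img = model(pixel_values=pixel_values, bbox=bbox, coordinate_mode="image")
        # origin at mid-hip, hips at unit distance; no bbox needed
        out_rel = model(pixel_values=pixel_values, coordinate_mode="root_relative")

    print(out_img.coordinate_mode, out_img.keypoints.shape)  # image (1, num_keypoints, 2)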
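To make the root-relative convention concrete, here is a small self-contained check (a sketch; indices 11 and 12 are the COCO left/right hips used by the diff):

    import torch

    # Toy pose: 17 COCO-format keypoints, batch of 1; only the hips matter here.
    kpts = torch.zeros(1, 17, 2)
    kpts[0, 11] = torch.tensor([100.0, 200.0])  # left hip
    kpts[0, 12] = torch.tensor([140.0, 200.0])  # right hip

    root = 0.5 * (kpts[:, 11] + kpts[:, 12])  # mid-hip: (120, 200)
    scale = (0.5 * torch.norm(kpts[:, 12] - kpts[:, 11], dim=-1,
                              keepdim=True)).clamp(min=1.0)  # half inter-hip distance: 20.0
    rel = (kpts - root.unsqueeze(1)) / scale.unsqueeze(1)

    print(rel[0, 11], rel[0, 12])  # tensor([-1., 0.]) tensor([1., 0.])

The clamp mirrors the degenerate-hip guard in the diff; applying it after the 0.5 factor keeps the scale itself from dropping below one model-space pixel.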