feat: add original_size param to forward() — boxes auto-scaled to image space
Browse files — modeling_rtmdet.py (+30 −8)
modeling_rtmdet.py
CHANGED
|
@@ -28,7 +28,9 @@ class DetectionOutput(ModelOutput):
|
|
| 28 |
|
| 29 |
Args:
|
| 30 |
boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
|
| 31 |
-
Detection boxes in format [x1, y1, x2, y2].
|
|
|
|
|
|
|
| 32 |
scores (`torch.FloatTensor` of shape `(batch_size, num_boxes)`):
|
| 33 |
Detection confidence scores.
|
| 34 |
labels (`torch.LongTensor` of shape `(batch_size, num_boxes)`):
|
|
@@ -1817,6 +1819,7 @@ class RTMDetModel(PreTrainedModel):
|
|
| 1817 |
def forward(
|
| 1818 |
self,
|
| 1819 |
pixel_values=None,
|
|
|
|
| 1820 |
labels=None,
|
| 1821 |
output_hidden_states=None,
|
| 1822 |
return_dict=None,
|
|
@@ -1826,11 +1829,15 @@ class RTMDetModel(PreTrainedModel):
|
|
| 1826 |
|
| 1827 |
Args:
|
| 1828 |
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
|
| 1829 |
-
Pixel values
|
| 1830 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1831 |
labels (`List[Dict]`, *optional*):
|
| 1832 |
-
Labels for computing the detection loss.
|
| 1833 |
-
List of dicts with 'boxes' and 'labels' keys.
|
| 1834 |
output_hidden_states (`bool`, *optional*):
|
| 1835 |
Whether or not to return the hidden states of all layers.
|
| 1836 |
return_dict (`bool`, *optional*):
|
|
@@ -1838,9 +1845,8 @@ class RTMDetModel(PreTrainedModel):
|
|
| 1838 |
|
| 1839 |
Returns:
|
| 1840 |
`DetectionOutput` or `tuple`:
|
| 1841 |
-
|
| 1842 |
-
|
| 1843 |
-
is the detection output tensor.
|
| 1844 |
"""
|
| 1845 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1846 |
|
|
@@ -1886,6 +1892,22 @@ class RTMDetModel(PreTrainedModel):
|
|
| 1886 |
max_per_img=self.config.max_detections
|
| 1887 |
)
|
| 1888 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1889 |
if return_dict:
|
| 1890 |
return results
|
| 1891 |
else:
|
|
|
|
| 28 |
|
| 29 |
Args:
|
| 30 |
boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
|
| 31 |
+
Detection boxes in format [x1, y1, x2, y2]. Coordinates are in
|
| 32 |
+
model-input space (640×640) by default, or in original image pixel
|
| 33 |
+
space when ``original_size`` was passed to ``forward()``.
|
| 34 |
scores (`torch.FloatTensor` of shape `(batch_size, num_boxes)`):
|
| 35 |
Detection confidence scores.
|
| 36 |
labels (`torch.LongTensor` of shape `(batch_size, num_boxes)`):
|
|
|
|
| 1819 |
def forward(
|
| 1820 |
self,
|
| 1821 |
pixel_values=None,
|
| 1822 |
+
original_size=None,
|
| 1823 |
labels=None,
|
| 1824 |
output_hidden_states=None,
|
| 1825 |
return_dict=None,
|
|
|
|
| 1829 |
|
| 1830 |
Args:
|
| 1831 |
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
|
| 1832 |
+
Pixel values resized to 640×640 by the image processor.
|
| 1833 |
+
original_size (`Tuple[int, int]`, *optional*):
|
| 1834 |
+
``(height, width)`` of the **original** image before preprocessing.
|
| 1835 |
+
When supplied, the returned boxes are automatically scaled from
|
| 1836 |
+
640×640 model-input space to original image pixel coordinates so
|
| 1837 |
+
the caller never needs to compute ``sx = orig_w / 640`` manually.
|
| 1838 |
+
All images in the batch are assumed to share the same original size.
|
| 1839 |
labels (`List[Dict]`, *optional*):
|
| 1840 |
+
Labels for computing the detection loss.
|
|
|
|
| 1841 |
output_hidden_states (`bool`, *optional*):
|
| 1842 |
Whether or not to return the hidden states of all layers.
|
| 1843 |
return_dict (`bool`, *optional*):
|
|
|
|
| 1845 |
|
| 1846 |
Returns:
|
| 1847 |
`DetectionOutput` or `tuple`:
|
| 1848 |
+
Boxes are in 640×640 space by default, or in original image space
|
| 1849 |
+
when ``original_size`` is provided.
|
|
|
|
| 1850 |
"""
|
| 1851 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1852 |
|
|
|
|
| 1892 |
max_per_img=self.config.max_detections
|
| 1893 |
)
|
| 1894 |
|
| 1895 |
+
# Scale boxes from 640×640 model space → original image space if requested
|
| 1896 |
+
if original_size is not None:
|
| 1897 |
+
orig_h, orig_w = original_size
|
| 1898 |
+
sx = orig_w / width # width == 640
|
| 1899 |
+
sy = orig_h / height # height == 640
|
| 1900 |
+
scaled_boxes = results.boxes.clone()
|
| 1901 |
+
scaled_boxes[..., 0] *= sx # x1
|
| 1902 |
+
scaled_boxes[..., 2] *= sx # x2
|
| 1903 |
+
scaled_boxes[..., 1] *= sy # y1
|
| 1904 |
+
scaled_boxes[..., 3] *= sy # y2
|
| 1905 |
+
results = DetectionOutput(
|
| 1906 |
+
boxes=scaled_boxes,
|
| 1907 |
+
scores=results.scores,
|
| 1908 |
+
labels=results.labels,
|
| 1909 |
+
)
|
| 1910 |
+
|
| 1911 |
if return_dict:
|
| 1912 |
return results
|
| 1913 |
else:
|