Upload model
Browse files
configuration_rf_detr.py: +19 −0
modeling_rf_detr.py: +38 −1
configuration_rf_detr.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Dict, Literal, List, OrderedDict
|
|
| 3 |
import torch
|
| 4 |
from transformers.configuration_utils import PretrainedConfig
|
| 5 |
from optimum.exporters.onnx.model_configs import ViTOnnxConfig
|
|
|
|
| 6 |
|
| 7 |
### modified from https://github.com/roboflow/rf-detr/blob/main/rfdetr/config.py
|
| 8 |
|
|
@@ -66,7 +67,25 @@ class RFDetrConfig(PretrainedConfig):
|
|
| 66 |
super().__init__(**kwargs)
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
class RFDetrOnnxConfig(ViTOnnxConfig):
|
|
|
|
|
|
|
| 70 |
@property
|
| 71 |
def inputs(self) -> Dict[str, Dict[int, str]]:
|
| 72 |
return OrderedDict(
|
|
|
|
| 3 |
import torch
|
| 4 |
from transformers.configuration_utils import PretrainedConfig
|
| 5 |
from optimum.exporters.onnx.model_configs import ViTOnnxConfig
|
| 6 |
+
from optimum.utils import DummyVisionInputGenerator
|
| 7 |
|
| 8 |
### modified from https://github.com/roboflow/rf-detr/blob/main/rfdetr/config.py
|
| 9 |
|
|
|
|
| 67 |
super().__init__(**kwargs)
|
| 68 |
|
| 69 |
|
| 70 |
+
class RFDetrDummyInputGenerator(DummyVisionInputGenerator):
|
| 71 |
+
def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
|
| 72 |
+
if input_name == "pixel_mask":
|
| 73 |
+
return self.random_mask_tensor(
|
| 74 |
+
shape=[self.batch_size, self.height, self.width],
|
| 75 |
+
framework=framework,
|
| 76 |
+
dtype="bool",
|
| 77 |
+
)
|
| 78 |
+
else:
|
| 79 |
+
return self.random_float_tensor(
|
| 80 |
+
shape=[self.batch_size, self.num_channels, self.height, self.width],
|
| 81 |
+
framework=framework,
|
| 82 |
+
dtype=float_dtype,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
class RFDetrOnnxConfig(ViTOnnxConfig):
|
| 87 |
+
DUMMY_INPUT_GENERATOR_CLASSES = (RFDetrDummyInputGenerator,)
|
| 88 |
+
|
| 89 |
@property
|
| 90 |
def inputs(self) -> Dict[str, Dict[int, str]]:
|
| 91 |
return OrderedDict(
|
modeling_rf_detr.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from typing import List, Dict
|
|
|
|
| 3 |
|
| 4 |
import torch
|
| 5 |
from torchvision.transforms import Resize, InterpolationMode
|
|
@@ -12,6 +13,38 @@ from .configuration_rf_detr import RFDetrConfig
|
|
| 12 |
|
| 13 |
### ONLY WORKS WITH Transformers version 4.50.3 and python 3.11
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
@dataclass
|
| 16 |
class RFDetrObjectDetectionOutput(ModelOutput):
|
| 17 |
loss: torch.Tensor = None
|
|
@@ -118,7 +151,11 @@ class RFDetrModelForObjectDetection(PreTrainedModel):
|
|
| 118 |
label["labels"] = label["labels"].to(self.config.device)
|
| 119 |
|
| 120 |
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor, labels=None, **kwargs) -> ModelOutput:
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
if labels is not None:
|
| 124 |
self.validate_labels(labels)
|
|
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from typing import List, Dict
|
| 3 |
+
import math
|
| 4 |
|
| 5 |
import torch
|
| 6 |
from torchvision.transforms import Resize, InterpolationMode
|
|
|
|
| 13 |
|
| 14 |
### ONLY WORKS WITH Transformers version 4.50.3 and python 3.11
|
| 15 |
|
| 16 |
+
# modified from https://github.com/roboflow/rf-detr/blob/develop/rfdetr/models/backbone/dinov2.py make_new_interpolated_pos_encoding
|
| 17 |
+
def _onnx_make_new_interpolated_pos_encoding(
|
| 18 |
+
position_embeddings, patch_size, height, width
|
| 19 |
+
):
|
| 20 |
+
|
| 21 |
+
num_positions = position_embeddings.shape[1] - 1
|
| 22 |
+
dim = position_embeddings.shape[-1]
|
| 23 |
+
height = height // patch_size
|
| 24 |
+
width = width // patch_size
|
| 25 |
+
|
| 26 |
+
class_pos_embed = position_embeddings[:, 0]
|
| 27 |
+
patch_pos_embed = position_embeddings[:, 1:]
|
| 28 |
+
|
| 29 |
+
# Reshape and permute
|
| 30 |
+
patch_pos_embed = patch_pos_embed.reshape(
|
| 31 |
+
1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
|
| 32 |
+
)
|
| 33 |
+
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
|
| 34 |
+
|
| 35 |
+
# Use bilinear interpolation without antialias
|
| 36 |
+
patch_pos_embed = F.interpolate(
|
| 37 |
+
patch_pos_embed,
|
| 38 |
+
size=(height, width),
|
| 39 |
+
mode="bicubic",
|
| 40 |
+
align_corners=False,
|
| 41 |
+
antialias=False,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Reshape back
|
| 45 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)
|
| 46 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
|
| 47 |
+
|
| 48 |
@dataclass
|
| 49 |
class RFDetrObjectDetectionOutput(ModelOutput):
|
| 50 |
loss: torch.Tensor = None
|
|
|
|
| 151 |
label["labels"] = label["labels"].to(self.config.device)
|
| 152 |
|
| 153 |
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor, labels=None, **kwargs) -> ModelOutput:
|
| 154 |
+
if torch.jit.is_tracing():
|
| 155 |
+
resize = Resize((self.config.resolution, self.config.resolution), interpolation=InterpolationMode.NEAREST) # interpolation mode set to nearest for onnx export
|
| 156 |
+
self.model.backbone[0].encoder.encoder.embeddings.interpolate_pos_encoding = lambda self_mod, embeddings, height, width : self.model.backbone[0].encoder.encoder.embeddings.position_embeddings # skip interpolation for onnx export
|
| 157 |
+
else:
|
| 158 |
+
resize = Resize((self.config.resolution, self.config.resolution))
|
| 159 |
|
| 160 |
if labels is not None:
|
| 161 |
self.validate_labels(labels)
|