Upload model

Browse files

Files changed (5) hide show

README.md +199 -0
config.json +41 -0
configuration_rf_detr.py +93 -0
model.safetensors +3 -0
modeling_rf_detr.py +158 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "amp": true,
+  "architectures": [
+    "RFDetrModelForObjectDetection"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_rf_detr.RFDetrConfig",
+    "AutoModelForObjectDetection": "modeling_rf_detr.RFDetrModelForObjectDetection"
+  },
+  "bbox_reparam": true,
+  "ca_nheads": 16,
+  "dec_layers": 3,
+  "dec_n_points": 2,
+  "device": "cpu",
+  "encoder": "dinov2_windowed_small",
+  "gradient_checkpointing": false,
+  "group_detr": 13,
+  "hidden_dim": 256,
+  "layer_norm": true,
+  "lite_refpoint_refine": true,
+  "model_name": "RFDETRBase",
+  "model_type": "rf-detr",
+  "num_classes": 90,
+  "num_queries": 300,
+  "out_feature_indexes": [
+    2,
+    5,
+    8,
+    11
+  ],
+  "pretrain_weights": "rf-detr-base.pth",
+  "pretrained": true,
+  "projector_scale": [
+    "P4"
+  ],
+  "resolution": 560,
+  "sa_nheads": 8,
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "two_stage": true
+}

configuration_rf_detr.py ADDED Viewed

	@@ -0,0 +1,93 @@

+from typing import Dict, Literal, List, OrderedDict
+import torch
+from transformers.configuration_utils import PretrainedConfig
+from optimum.exporters.onnx.model_configs import ViTOnnxConfig
+### modified from https://github.com/roboflow/rf-detr/blob/main/rfdetr/config.py
+# Default device string, resolved once when this module is imported:
+# "cuda" if CUDA is available, otherwise "mps" if the MPS backend is available, otherwise "cpu".
+DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+class RFDetrConfig(PretrainedConfig):
+    model_type = 'rf-detr'
+    def __init__(
+        self,
+        model_name: Literal['RFDETRBase, RFDETRLarge'] = 'RFDETRBase',
+        pretrained: bool = True,
+        out_feature_indexes: List[int] = [2, 5, 8, 11],
+        dec_layers: int = 3,
+        two_stage: bool = True,
+        bbox_reparam: bool = True,
+        lite_refpoint_refine: bool = True,
+        layer_norm: bool = True,
+        amp: bool = True,
+        num_classes: int = 90,
+        num_queries: int  = 300,
+        device: Literal["cpu", "cuda", "mps"] = DEVICE,
+        resolution: int = 560,
+        group_detr: int = 13,
+        gradient_checkpointing: bool = False,
+        **kwargs
+    ):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.out_feature_indexes = out_feature_indexes
+        self.dec_layers = dec_layers
+        self.two_stage = two_stage
+        self.bbox_reparam = bbox_reparam
+        self.lite_refpoint_refine = lite_refpoint_refine
+        self.layer_norm = layer_norm
+        self.amp = amp
+        self.num_classes = num_classes
+        self.device = device
+        self.resolution = resolution
+        self.group_detr = group_detr
+        self.gradient_checkpointing = gradient_checkpointing
+        self.num_queries = num_queries
+        if self.model_name == 'RFDETRBase':
+            self.encoder = "dinov2_windowed_small"
+            self.hidden_dim = 256
+            self.sa_nheads = 8
+            self.ca_nheads = 16
+            self.dec_n_points = 2
+            self.projector_scale = ["P4"]
+            self.pretrain_weights = "rf-detr-base.pth"
+        elif self.model_name == 'RFDETRLarge':
+            self.encoder = "dinov2_windowed_base"
+            self.hidden_dim = 384
+            self.sa_nheads = 12
+            self.ca_nheads = 24
+            self.dec_n_points = 4
+            self.projector_scale = ["P3", "P5"]
+            self.pretrain_weights = "rf-detr-large.pth"
+        if not self.pretrained:
+            self.pretrain_weights = ""
+        super().__init__(**kwargs)
+class RFDetrOnnxConfig(ViTOnnxConfig):
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return OrderedDict(
+            {
+                "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+                "pixel_mask": {0: "batch_size", 2: "height", 3: "width"},
+            }
+        )
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        common_outputs = super().outputs
+        if self.task == "object-detection":
+            common_outputs["logits"] = {0: "batch_size", 1: "num_queries", 2: "num_classes"}
+            common_outputs["pred_boxes"] = {0: "batch_size", 1: "num_queries", 2: "4"}
+        return common_outputs
+# Public names exported by this module.
+__all__ = [
+    'RFDetrConfig',
+    'RFDetrOnnxConfig'
+]

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e111471a1b37b21f6970075eb663e383b63cf99585968e3f67c2cc1507511a02
+size 128760872

modeling_rf_detr.py ADDED Viewed

	@@ -0,0 +1,158 @@

+from dataclasses import dataclass
+from typing import List, Dict
+import torch
+from torchvision.transforms import Resize
+from transformers import PreTrainedModel
+from transformers.utils import ModelOutput
+from rfdetr import RFDETRBase, RFDETRLarge
+from rfdetr.util.misc import NestedTensor
+from .configuration_rf_detr import RFDetrConfig
+@dataclass
+class RFDetrObjectDetectionOutput(ModelOutput):
+    """Output container returned by ``RFDetrModelForObjectDetection.forward``.
+    Every field defaults to ``None``.
+    """
+    # Sum of the 'loss_fl', 'loss_bbox' and 'loss_giou' entries of loss_dict (see compute_loss).
+    loss: torch.Tensor = None
+    # Individual loss/metric terms selected from the criterion output in compute_loss.
+    loss_dict: Dict[str, torch.Tensor] = None
+    # Taken from the wrapped model's outputs["pred_logits"].
+    logits: torch.FloatTensor = None
+    # Taken from the wrapped model's outputs["pred_boxes"].
+    pred_boxes: torch.FloatTensor = None
+    # Taken from the wrapped model's outputs["aux_outputs"].
+    aux_outputs: List[Dict[str, torch.Tensor]] = None
+    # Taken from the wrapped model's outputs["enc_outputs"].
+    enc_outputs: Dict[str, torch.Tensor] = None
+class RFDetrModelForObjectDetection(PreTrainedModel):
+    config_class = RFDetrConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        models = {
+            'RFDETRBase': RFDETRBase,
+            'RFDETRLarge': RFDETRLarge,
+        }
+        rf_detr_model = models[config.model_name](
+            out_feature_indexes = config.out_feature_indexes,
+            dec_layers = config.dec_layers,
+            two_stage = config.two_stage,
+            bbox_reparam = config.bbox_reparam,
+            lite_refpoint_refine = config.lite_refpoint_refine,
+            layer_norm = config.layer_norm,
+            amp = config.amp,
+            num_classes = config.num_classes,
+            device = config.device,
+            resolution = config.resolution,
+            group_detr = config.group_detr,
+            gradient_checkpointing = config.gradient_checkpointing,
+            num_queries = config.num_queries,
+            encoder = config.encoder,
+            hidden_dim = config.hidden_dim,
+            sa_nheads = config.sa_nheads,
+            ca_nheads = config.ca_nheads,
+            dec_n_points = config.dec_n_points,
+            projector_scale = config.projector_scale,
+            pretrain_weights = config.pretrain_weights,
+        )
+        self.model = rf_detr_model.model.model
+        self.criterion = rf_detr_model.model.criterion
+    def compute_loss(self, labels, outputs):
+        """
+        Parameters
+        ----------
+            labels: list[Dict[str, torch.Tensor]]
+                list of bounding boxes and labels for each image in the batch.
+            outputs:
+                outputs from rfdetr model
+        """
+        loss = None
+        loss_dict = None
+        if self.model.training:
+            if labels is None:
+                torch._assert(False, "targets should not be none when in training mode")
+            else:
+                losses = self.criterion(outputs, targets=labels)
+                loss_dict = {
+                    'loss_fl': losses["loss_ce"],
+                    'class_error': losses["class_error"],
+                    'cardinality_error': losses["cardinality_error"],
+                    'loss_bbox': losses["loss_bbox"],
+                    'loss_giou': losses["loss_giou"],
+                }
+                loss = sum(loss_dict[k] for k in ['loss_fl', 'loss_bbox', 'loss_giou'])
+        return loss, loss_dict
+    def validate_labels(self, labels):
+        # Check for degenerate boxes
+        for label_idx, label in enumerate(labels):
+            boxes = label["boxes"]
+            degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
+            if degenerate_boxes.any():
+                # print the first degenerate box
+                bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
+                degen_bb: List[float] = boxes[bb_idx].tolist()
+                torch._assert(
+                    False,
+                    "All bounding boxes should have positive height and width."
+                    f" Found invalid box {degen_bb} for target at index {label_idx}.",
+                )
+            # rename key class_labels to labels for compute_loss
+            if 'class_labels' in label.keys():
+                label['labels'] = label.pop('class_labels')
+    def resize_labels(self, labels, h, w):
+        hr = self.config.resolution / float(h)
+        wr = self.config.resolution / float(w)
+        for label in labels:
+            boxes = label["boxes"].to(device=self.config.device, dtype=torch.float32)
+            # resize boxes to model's resolution
+            boxes[:, 0] *= wr
+            boxes[:, 1] *= hr
+            boxes[:, 2] *= wr
+            boxes[:, 3] *= hr
+            # convert top left to center x, y
+            boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2
+            boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2
+            # normalize to [0, 1] by model's resolution
+            boxes[:] /= self.config.resolution
+            label["boxes"] = boxes
+            if "labels" in label:
+                label["labels"] = label["labels"].to(self.config.device)
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor, labels=None, **kwargs) -> ModelOutput:
+        resize = Resize((self.config.resolution, self.config.resolution)) # resize pixel values and mask to model's resolution
+        pixel_values = pixel_values.to(self.config.device)
+        pixel_mask = pixel_mask.to(self.config.device)
+        pixel_values = resize(pixel_values)
+        pixel_mask = resize(pixel_mask)
+        if labels is not None:
+            self.validate_labels(labels)
+            _, _, h, w = pixel_values.shape
+            self.resize_labels(labels, h, w) # reshape labels with model's resolution
+        else:
+            self.model.training = False
+            self.model.transformer.training = False
+            for layer in self.model.transformer.decoder.layers:
+                layer.training = False
+            self.criterion.training = False
+        samples = NestedTensor(pixel_values, pixel_mask)
+        outputs = self.model(samples)
+        loss, loss_dict = self.compute_loss(labels, outputs)
+        return RFDetrObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=outputs["pred_logits"],
+            pred_boxes=outputs["pred_boxes"],
+            aux_outputs=outputs["aux_outputs"],
+            enc_outputs=outputs["enc_outputs"],
+        )
+# Public names exported by this module.
+__all__ = [
+    "RFDetrModelForObjectDetection"
+]