Fix bug; tested locally.
Changed files:
- utils/model.py +211 -262
- utils/predict.py +1 -1
utils/model.py CHANGED
@@ -157,7 +157,7 @@ class OwlViTForClassification(nn.Module):
     config_class = OwlViTConfig

     def __init__(self, owlvit_det_model, num_classes, weight_dict, device, freeze_box_heads=False, train_box_heads_only=False, network_type=None, logits_from_teacher=False, finetuned: bool = False, custom_box_head: bool = False):
-        super(OwlViTForClassification, self).__init__()
+        super().__init__()

         self.config = owlvit_det_model.config
         self.num_classes = num_classes
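(Inside the class body, the zero-argument `super().__init__()` is equivalent to the older two-argument form, so this hunk is a modernization with no behavior change.)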
@@ -202,12 +202,12 @@ class OwlViTForClassification(nn.Module):
         losses += ["boxes"] if weight_dict["loss_bbox"] > 0 else []
         losses += ["labels"] if weight_dict["loss_ce"] > 0 else []

-        self.criterion = DetrLoss(
-            matcher=None,
-            num_parts=self.num_parts,
-            eos_coef=0.1,  # Following facebook/detr-resnet-50
-            losses=losses,
-        )
+        # self.criterion = DetrLoss(
+        #     matcher=None,
+        #     num_parts=self.num_parts,
+        #     eos_coef=0.1,  # Following facebook/detr-resnet-50
+        #     losses=losses,
+        # )

         self.freeze_parameters(freeze_box_heads, train_box_heads_only)
         del owlvit_det_model
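Disabling the `DetrLoss` criterion here pairs with the forward-pass hunk further down: once `self.criterion` is never constructed, every loss computation that referenced it has to go too, otherwise `forward()` would raise an `AttributeError`. The class definition itself is commented out at the bottom of the file rather than deleted.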
@@ -417,22 +417,7 @@ class OwlViTForClassification(nn.Module):
             topk_scores, topk_idxs = torch.topk(teacher_boxes_logits, k=1, dim=1)

         else:
-
-            print(f"text_inputs_parts - input_ids: {text_inputs_parts['input_ids'].shape}. attention_mask : {text_inputs_parts['attention_mask'].shape}")
-            seq_length = text_inputs_parts['input_ids'].shape[-1]
-            position_ids = self.owlvit.text_model.embeddings.position_ids[:, :seq_length]
-            txt_embeds = self.owlvit.text_model.embeddings.token_embedding(text_inputs_parts['input_ids'])
-            print(f"position_embedding: {self.owlvit.text_model.embeddings.position_embedding(position_ids).shape}")
-            print(f"text_embeds: {txt_embeds.shape}")
-
-            device_ = txt_embeds.device
-            position_ids = position_ids.to(device_)
-            txt_embeds_size_0 = text_embeds.size(0)
-            position_embedding = position_ids.cpu().repeat(txt_embeds_size_0, 1, 1)
-            text_inputs_parts["position_ids"] = position_ids
-            print(f"position_embedding : {position_embedding.shape}")
-            print(f"pos + emb: {(txt_embeds.cpu() + position_embedding).shape}")
-            text_embeds_parts = self.owlvit.text_model.get_text_features(**text_inputs_parts)
+            text_embeds_parts = self.owlvit.get_text_features(**text_inputs_parts)

         # # Embed images and text queries
         query_mask, text_embeds_parts = self._get_text_query_mask(text_inputs_parts, text_embeds_parts, batch_size)
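This hunk is the actual bug fix: the removed branch rebuilt position embeddings by hand and then called `get_text_features` on the inner text tower (`self.owlvit.text_model`), which does not define that method in stock `transformers`; the replacement calls it on the top-level OWL-ViT model, which builds position ids internally. A minimal standalone sketch of the corrected call path (the checkpoint name and query strings are illustrative):

import torch
from transformers import OwlViTProcessor, OwlViTModel

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")

# The processor produces the same kind of dict as text_inputs_parts:
# input_ids plus attention_mask, one row per part descriptor.
text_inputs = processor(text=["bird wing", "bird beak"], return_tensors="pt")

with torch.no_grad():
    # Pooled, projected text embeddings; position ids are handled
    # inside the call, so no manual embedding arithmetic is needed.
    text_embeds = model.get_text_features(**text_inputs)

print(text_embeds.shape)  # (num_queries, projection_dim)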
@@ -460,46 +445,10 @@ class OwlViTForClassification(nn.Module):
         outputs_loss["logits"] = pred_logits_parts
         outputs_loss["pred_boxes"] = pred_boxes

-        # Compute box + class losses
-        loss_dict = self.criterion(outputs_loss, targets, mapping_indices)
-
-        # Compute symmetric loss to get rid of the teacher model
-        logits_per_image = torch.softmax(pred_logits_parts, dim=1)
-        logits_per_text = torch.softmax(pred_logits_parts, dim=-1)
-
-        # For getting rid of the teacher model
-        if self.weight_dict["loss_sym_box_label"] > 0:
-            sym_loss_box_label = self.loss_symmetric(logits_per_image, logits_per_text, teacher_boxes_logits)
-            loss_dict["loss_sym_box_label"] = sym_loss_box_label
-        # ----------------------------------------------------------------------------------------
-
-        #DEBUG:
-        print(f"im_features size: {image_feats.shape}, text_embeds size: {text_embeds.shape}")
-        print(f"im_features sum: {image_feats.sum().item()}, text_embeds sum: {text_embeds.sum().item()}")
         # Predict image-level classes (batch_size, num_patches, num_queries)
         image_text_logits, pred_logits, part_logits = self.cls_head(image_feats, text_embeds, topk_idxs)
-
-
-        print(f"image_text_logits sum: {image_text_logits.sum().item()}")
-
-        if self.weight_dict["loss_xclip"] > 0:
-            targets_cls = torch.tensor([target["targets_cls"] for target in targets]).unsqueeze(1).to(self.device)
-            if self.network_type == "classification":
-                one_hot = torch.zeros_like(pred_logits).scatter(1, targets_cls, 1).to(self.device)
-                cls_loss = self.ce_loss(pred_logits, one_hot)
-                loss_dict["loss_xclip"] = cls_loss
-            else:
-                # TODO: Need a linear classifier for this approach
-                # Compute symmetric loss for part-descriptor contrastive learning
-                logits_per_image = torch.softmax(image_text_logits, dim=0)
-                logits_per_text = torch.softmax(image_text_logits, dim=-1)
-                sym_loss = self.loss_symmetric(logits_per_image, logits_per_text, targets_cls)
-                loss_dict["loss_xclip"] = sym_loss
-
-        #DEBUG:
-        print(f"pred_logits size: {part_logits.shape}, pred_logits size: {part_logits.shape}")
-        print(f"part_logits sum: {pred_logits.sum().item()}, part_logits sum: {pred_logits.sum().item()}")
-        return pred_logits, part_logits, loss_dict
+
+        return pred_logits, part_logits

     def loss_symmetric(self, text_logits: torch.Tensor, image_logits: torch.Tensor, targets: torch.Tensor, box_labels: torch.Tensor = None) -> torch.Tensor:
         # text/image logits (batch_size*num_boxes, num_classes*num_descs): The logits that softmax over text descriptors or boxes
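With the criterion call, the symmetric losses, and the debug prints stripped out, `forward()` is now inference-only and returns a two-tuple. Every caller that previously unpacked `pred_logits, part_logits, loss_dict` therefore has to drop the third value; that is exactly the one-line companion fix in utils/predict.py at the end of this commit.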
@@ -537,204 +486,204 @@ class OwlViTForClassification(nn.Module):

         return sym_loss

-class DetrLoss(nn.Module):
-    ... (lines 541-740: class body, collapsed in the diff view; identical to the commented-out copy added below, without the leading "# ")
+# class DetrLoss(nn.Module):
+#     """
+#     This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1)
+#     we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
+#     of matched ground-truth / prediction (supervise class and box).
+
+#     A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes`
+#     parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
+#     the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to
+#     be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2
+#     (`max_obj_id` + 1). For more details on this, check the following discussion
+#     https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
+
+
+#     Args:
+#         matcher (`DetrHungarianMatcher`):
+#             Module able to compute a matching between targets and proposals.
+#         num_parts (`int`):
+#             Number of object categories, omitting the special no-object category.
+#         eos_coef (`float`):
+#             Relative classification weight applied to the no-object category.
+#         losses (`List[str]`):
+#             List of all the losses to be applied. See `get_loss` for a list of all available losses.
+#     """
+
+#     def __init__(self, matcher, num_parts, eos_coef, losses):
+#         super().__init__()
+#         self.matcher = matcher
+#         self.num_parts = num_parts
+#         self.eos_coef = eos_coef
+#         self.losses = losses
+
+#         # empty_weight = torch.ones(self.num_parts + 1)
+#         empty_weight = torch.ones(self.num_parts)
+#         empty_weight[-1] = self.eos_coef
+#         self.register_buffer("empty_weight", empty_weight)
+
+#     # removed logging parameter, which was part of the original implementation
+#     def loss_labels(self, outputs, targets, indices, num_boxes):
+#         """
+#         Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
+#         [nb_target_boxes]
+#         """
+#         if "logits" not in outputs:
+#             raise KeyError("No logits were found in the outputs")
+#         source_logits = outputs["logits"]
+
+#         idx = self._get_source_permutation_idx(indices)
+#         # target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
+#         # target_classes = torch.full(source_logits.shape[:2], self.num_parts, dtype=torch.int64, device=source_logits.device)
+#         # target_classes[idx] = target_classes_o
+
+#         source_logits = source_logits[idx].view(len(indices), -1, self.num_parts)
+#         target_classes = torch.stack([t["class_labels"][J] for t, (_, J) in zip(targets, indices)], dim=0)
+
+#         loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
+#         losses = {"loss_ce": loss_ce}
+
+#         return losses
+
+#     @torch.no_grad()
+#     def loss_cardinality(self, outputs, targets, indices, num_boxes):
+#         """
+#         Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
+
+#         This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
+#         """
+#         logits = outputs["logits"]
+#         device = logits.device
+#         target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
+#         # Count the number of predictions that are NOT "no-object" (which is the last class)
+#         card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
+#         card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
+#         losses = {"cardinality_error": card_err}
+#         return losses
+
+#     def loss_boxes(self, outputs, targets, indices, num_boxes):
+#         """
+#         Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
+
+#         Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
+#         are expected in format (center_x, center_y, w, h), normalized by the image size.
+#         """
+#         if "pred_boxes" not in outputs:
+#             raise KeyError("No predicted boxes found in outputs")
+
+#         idx = self._get_source_permutation_idx(indices)
+#         source_boxes = outputs["pred_boxes"][idx]
+#         target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+#         losses = {}
+
+#         loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
+#         losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+#         loss_giou = 1 - torch.diag(generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)))
+#         losses["loss_giou"] = loss_giou.sum() / num_boxes
+
+#         return losses
+
+#     def loss_masks(self, outputs, targets, indices, num_boxes):
+#         """
+#         Compute the losses related to the masks: the focal loss and the dice loss.
+
+#         Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
+#         """
+#         if "pred_masks" not in outputs:
+#             raise KeyError("No predicted masks found in outputs")
+
+#         source_idx = self._get_source_permutation_idx(indices)
+#         target_idx = self._get_target_permutation_idx(indices)
+#         source_masks = outputs["pred_masks"]
+#         source_masks = source_masks[source_idx]
+#         masks = [t["masks"] for t in targets]
+
+#         # TODO use valid to mask invalid areas due to padding in loss
+#         target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+#         target_masks = target_masks.to(source_masks)
+#         target_masks = target_masks[target_idx]
+
+#         # upsample predictions to the target size
+#         source_masks = nn.functional.interpolate(
+#             source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
+#         )
+#         source_masks = source_masks[:, 0].flatten(1)
+
+#         target_masks = target_masks.flatten(1)
+#         target_masks = target_masks.view(source_masks.shape)
+#         losses = {
+#             "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
+#             "loss_dice": dice_loss(source_masks, target_masks, num_boxes),
+#         }
+#         return losses
+
+#     def _get_source_permutation_idx(self, indices):
+#         # permute predictions following indices
+#         batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
+#         source_idx = torch.cat([source for (source, _) in indices])
+#         return batch_idx, source_idx
+
+#     def _get_target_permutation_idx(self, indices):
+#         # permute targets following indices
+#         batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
+#         target_idx = torch.cat([target for (_, target) in indices])
+#         return batch_idx, target_idx
+
+#     def get_loss(self, loss, outputs, targets, indices, num_boxes):
+#         loss_map = {
+#             "labels": self.loss_labels,
+#             "cardinality": self.loss_cardinality,
+#             "boxes": self.loss_boxes,
+#             "masks": self.loss_masks,
+#         }
+#         if loss not in loss_map:
+#             raise ValueError(f"Loss {loss} not supported")
+#         return loss_map[loss](outputs, targets, indices, num_boxes)
+
+#     def forward(self, outputs, targets, indices):
+#         """
+#         This performs the loss computation.
+
+#         Args:
+#             outputs (`dict`, *optional*):
+#                 Dictionary of tensors, see the output specification of the model for the format.
+#             targets (`List[dict]`, *optional*):
+#                 List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
+#                 losses applied, see each loss' doc.
+#         """
+#         outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
+
+#         # ThangPM: Do NOT use bipartite matching --> Use the boxes selected by argmax for computing symmetric loss
+#         # Retrieve the matching between the outputs of the last layer and the targets
+#         # indices = self.matcher(outputs_without_aux, targets)
+
+#         # Compute the average number of target boxes across all nodes, for normalization purposes
+#         num_boxes = sum(len(t["class_labels"]) for t in targets)
+#         num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+#         # (Niels): comment out function below, distributed training to be added
+#         # if is_dist_avail_and_initialized():
+#         #     torch.distributed.all_reduce(num_boxes)
+#         # (Niels) in original implementation, num_boxes is divided by get_world_size()
+#         num_boxes = torch.clamp(num_boxes, min=1).item()
+
+#         # Compute all the requested losses
+#         losses = {}
+#         for loss in self.losses:
+#             losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+
+#         # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+#         if "auxiliary_outputs" in outputs:
+#             for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+#                 # indices = self.matcher(auxiliary_outputs, targets)
+#                 for loss in self.losses:
+#                     if loss == "masks":
+#                         # Intermediate masks losses are too costly to compute, we ignore them.
+#                         continue
+#                     l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+#                     l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+#                     losses.update(l_dict)
+
+#         return losses
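For reference while the class is parked as comments: the box term that `loss_boxes` computes is a normalized L1 plus a GIoU penalty over matched box pairs. A self-contained sketch using `torchvision.ops` in place of the `transformers` helpers (`center_to_corners_format`, `generalized_box_iou`) that the commented code relies on:

import torch
from torchvision.ops import box_convert, generalized_box_iou

def detr_box_loss(src_boxes: torch.Tensor, tgt_boxes: torch.Tensor, num_boxes: int):
    # src_boxes/tgt_boxes: matched pairs in normalized (cx, cy, w, h) format.
    loss_bbox = torch.nn.functional.l1_loss(src_boxes, tgt_boxes, reduction="none")
    giou = generalized_box_iou(
        box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
        box_convert(tgt_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
    )
    # Diagonal = GIoU of each pair with its own match.
    loss_giou = 1 - torch.diag(giou)
    return loss_bbox.sum() / num_boxes, loss_giou.sum() / num_boxes

# Example with two matched boxes:
src = torch.tensor([[0.5, 0.5, 0.2, 0.2], [0.3, 0.3, 0.1, 0.1]])
tgt = torch.tensor([[0.5, 0.5, 0.25, 0.25], [0.3, 0.3, 0.1, 0.1]])
print(detr_box_loss(src, tgt, num_boxes=2))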
utils/predict.py CHANGED
@@ -112,7 +112,7 @@ def xclip_pred(new_desc: dict,
    image_input = owlvit_processor(images=image, return_tensors='pt').to(device)
    image_embeds, _ = model.image_embedder(pixel_values = image_input['pixel_values'])

-    pred_logits, part_logits, loss_dict = model(image_embeds, part_embeds, query_embeds, None)
+    pred_logits, part_logits = model(image_embeds, part_embeds, query_embeds, None)

    b, c, n = part_logits.shape
    mask = torch.tensor(desc_mask, dtype=float).unsqueeze(0).unsqueeze(0).repeat(b, c, 1).to(device)
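Downstream of the fixed call, `xclip_pred` masks the per-descriptor logits. A sketch of how that step plausibly behaves, with hypothetical shapes matching the `(b, c, n)` unpack in the hunk (`desc_mask` assumed to be a 0/1 list over descriptor slots; the averaging at the end is illustrative, not taken from the source):

import torch

b, c, n = 2, 200, 12                    # batch, classes, descriptor slots (illustrative)
part_logits = torch.randn(b, c, n)
desc_mask = [1] * 10 + [0] * 2          # hypothetical: 10 valid descriptors, 2 padding slots

# Same construction as the hunk: broadcast the per-slot mask over batch and classes.
mask = torch.tensor(desc_mask, dtype=torch.float).unsqueeze(0).unsqueeze(0).repeat(b, c, 1)

# One plausible consumption: zero out padded slots, then average over valid ones.
masked_mean = (part_logits * mask).sum(dim=-1) / mask.sum(dim=-1).clamp(min=1)
print(masked_mean.shape)  # (b, c)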