xpuenabler commited on
Commit
e3454bb
·
verified ·
1 Parent(s): 054fad9

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## InternVL-OVD (inference-only)
2
+
3
+ This repository contains inference-only artifacts exported from a training checkpoint.
4
+
5
+ ### Quick start (single image)
6
+
7
+ ```python
8
+ import torch
9
+ from PIL import Image
10
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
11
+
12
+ repo_id = "YOUR_ORG/YOUR_MODEL"
13
+ image_path = "flower.jpg"
14
+ query = "flower"
15
+
16
+ cfg = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
17
+ tokenizer = AutoTokenizer.from_pretrained(cfg.vlm_model_name, trust_remote_code=True, use_fast=False)
18
+ model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
19
+ model.eval()
20
+
21
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
+ model = model.to(device)
23
+
24
+ pil = Image.open(image_path).convert("RGB")
25
+ outputs = model.infer_image(image=pil, query=query, tokenizer=tokenizer)
26
+
27
+ pred_boxes = outputs.pred_boxes[0].float().cpu()
28
+ pred_scores = outputs.pred_scores[0].squeeze(-1).float().sigmoid().cpu()
29
+ print(pred_boxes[:5])
30
+ print(pred_scores[:5])
31
+ ```
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "InternVLOVDForDetection"
4
+ ],
5
+ "head_type": "small",
6
+ "cost_bbox": 5.0,
7
+ "cost_class": 0.0,
8
+ "cost_giou": 2.0,
9
+ "device_map": "cuda",
10
+ "dim_feedforward": 1024,
11
+ "dropout": 0.0,
12
+ "dtype": "bfloat16",
13
+ "eos_coef": 0.1,
14
+ "focal_alpha": 0.75,
15
+ "focal_gamma": 2.0,
16
+ "freeze_backbone": true,
17
+ "hidden_size": 1024,
18
+ "loss_bbox": 5.0,
19
+ "loss_cls": 0.0,
20
+ "loss_giou": 2.0,
21
+ "loss_mode": "bbox_only",
22
+ "model_type": "internvl_ovd",
23
+ "nhead": 8,
24
+ "num_decoder_layers": 2,
25
+ "num_queries": 1,
26
+ "token_fpn_include_text": true,
27
+ "token_fpn_levels": [
28
+ 16,
29
+ 8,
30
+ 4,
31
+ 2
32
+ ],
33
+ "transformers_version": "4.57.3",
34
+ "use_focal_loss": false,
35
+ "use_token_fpn": false,
36
+ "vlm_model_name": "OpenGVLab/InternVL3_5-1B",
37
+ "auto_map": {
38
+ "AutoConfig": "configuration_internvl_ovd.InternVLOVDConfig",
39
+ "AutoModel": "modeling_internvl_ovd.InternVLOVDForDetection"
40
+ }
41
+ }
configuration_internvl_ovd.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Optional
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
+ def _as_tuple_ints(value: object, *, default: tuple[int, ...]) -> tuple[int, ...]:
9
+ """Normalize token_fpn_levels which may come from JSON as a list."""
10
+ if value is None:
11
+ return default
12
+ if isinstance(value, tuple):
13
+ return tuple(int(v) for v in value)
14
+ if isinstance(value, list):
15
+ return tuple(int(v) for v in value)
16
+ return default
17
+
18
+
19
class InternVLOVDConfig(PretrainedConfig):
    """
    HuggingFace configuration for InternVL-OVD.

    NOTE:
    - For convenience this config also stores some runtime/data fields used by this repo.
    - For Hub inference, only model-relevant fields are strictly required.

    Field groups:
    - Model/backbone: VLM backbone id plus detection-head hyperparameters.
    - Loss: Hungarian-matching costs and loss weights.
    - Data/runtime and Train/runtime: repo-local training conveniences that
      are irrelevant for Hub inference but kept so one config drives both.
    """

    model_type = "internvl_ovd"

    def __init__(
        self,
        # Model/backbone
        vlm_model_name: str = "OpenGVLab/InternVL3_5-1B",
        hidden_size: int = 1024,
        num_queries: int = 1,
        use_token_fpn: bool = False,
        token_fpn_levels: tuple[int, ...] = (16, 8, 4, 2),
        token_fpn_include_text: bool = True,
        # NOTE(review): class default is "detr" while the shipped config.json
        # uses "small" — confirm which one checkpoints actually expect.
        head_type: str = "detr",
        nhead: int = 8,
        num_decoder_layers: int = 2,
        dim_feedforward: int = 1024,
        dropout: float = 0.0,
        dtype: str = "bfloat16",
        device_map: str = "cuda",
        freeze_backbone: bool = True,
        # Loss
        cost_bbox: float = 5.0,
        cost_giou: float = 2.0,
        cost_class: float = 1.0,
        loss_bbox: float = 5.0,
        loss_giou: float = 2.0,
        loss_cls: float = 1.0,
        eos_coef: float = 0.1,
        use_focal_loss: bool = False,
        focal_alpha: float = 0.75,
        focal_gamma: float = 2.0,
        loss_mode: str = "hungarian",
        # Data/runtime (repo convenience)
        dataset_type: str = "unknown",
        coco_root: str = "/coco/root/path",
        train_ann_file: Optional[str] = None,
        val_ann_file: Optional[str] = None,
        train_img_dir: Optional[str] = None,
        val_img_dir: Optional[str] = None,
        refcoco_train_split: str = "val",
        refcoco_val_split: str = "testB",
        refcoco_train_max_samples: Optional[int] = None,
        refcoco_val_max_samples: Optional[int] = None,
        max_num_patches: int = 12,
        input_size: int = 448,
        max_instances: int = 100,
        num_workers: int = 4,
        # Train/runtime (repo convenience)
        batch_size: int = 8,
        gradient_accumulation_steps: int = 1,
        num_epochs: int = 50,
        lr: float = 1e-4,
        weight_decay: float = 1e-4,
        lr_scheduler: str = "cosine",
        warmup_epochs: float = 1.0,
        max_grad_norm: float = 0.1,
        log_every: int = 50,
        use_wandb: bool = True,
        wandb_project: str = "internvl-ovd",
        wandb_run_name: Optional[str] = None,
        save_dir: str = "./checkpoints",
        save_every_steps: int = 1000,
        save_total_limit: int = 5,
        val_every_steps: int = 1000,
        eval_on_train: bool = True,
        train_eval_ratio: float = 0.001,
        train_eval_max_samples: Optional[int] = 128,
        device: str = "cuda",
        resume_from: Optional[str] = None,
        seed: int = 42,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        # Model/backbone
        self.vlm_model_name = vlm_model_name
        self.hidden_size = hidden_size
        self.num_queries = num_queries
        self.use_token_fpn = use_token_fpn
        # token_fpn_levels may arrive as a JSON list; normalize to tuple[int, ...].
        self.token_fpn_levels = _as_tuple_ints(token_fpn_levels, default=(16, 8, 4, 2))
        self.token_fpn_include_text = token_fpn_include_text
        self.head_type = head_type
        self.nhead = nhead
        self.num_decoder_layers = num_decoder_layers
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        self.dtype = dtype
        self.device_map = device_map
        self.freeze_backbone = freeze_backbone

        # Loss
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        self.cost_class = cost_class
        self.loss_bbox = loss_bbox
        self.loss_giou = loss_giou
        self.loss_cls = loss_cls
        self.eos_coef = eos_coef
        self.use_focal_loss = use_focal_loss
        self.focal_alpha = focal_alpha
        self.focal_gamma = focal_gamma
        self.loss_mode = loss_mode

        # Data
        self.dataset_type = dataset_type
        self.coco_root = coco_root
        # Annotation/image paths default to the standard COCO 2017 layout
        # under coco_root when not given explicitly.
        self.train_ann_file = train_ann_file or f"{self.coco_root}/annotations/instances_train2017.json"
        self.val_ann_file = val_ann_file or f"{self.coco_root}/annotations/instances_val2017.json"
        self.train_img_dir = train_img_dir or f"{self.coco_root}/train2017"
        self.val_img_dir = val_img_dir or f"{self.coco_root}/val2017"
        self.refcoco_train_split = refcoco_train_split
        self.refcoco_val_split = refcoco_val_split
        self.refcoco_train_max_samples = refcoco_train_max_samples
        self.refcoco_val_max_samples = refcoco_val_max_samples
        self.max_num_patches = max_num_patches
        self.input_size = input_size
        self.max_instances = max_instances
        self.num_workers = num_workers

        # Train/runtime
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.num_epochs = num_epochs
        self.lr = lr
        self.weight_decay = weight_decay
        self.lr_scheduler = lr_scheduler
        self.warmup_epochs = warmup_epochs
        self.max_grad_norm = max_grad_norm
        self.log_every = log_every
        self.use_wandb = use_wandb
        self.wandb_project = wandb_project
        self.wandb_run_name = wandb_run_name
        self.save_dir = save_dir
        self.save_every_steps = save_every_steps
        self.save_total_limit = save_total_limit
        self.val_every_steps = val_every_steps
        self.eval_on_train = eval_on_train
        self.train_eval_ratio = train_eval_ratio
        self.train_eval_max_samples = train_eval_max_samples
        self.device = device
        self.resume_from = resume_from
        self.seed = seed
169
+
170
+
heads.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+
10
+ class _CrossAttentionLayer(nn.Module):
11
+ def __init__(self, config: object) -> None:
12
+ super().__init__()
13
+ self.cross_attn = nn.MultiheadAttention(
14
+ embed_dim=config.hidden_size,
15
+ num_heads=config.nhead,
16
+ dropout=config.dropout,
17
+ batch_first=True,
18
+ )
19
+ self.linear1 = nn.Linear(config.hidden_size, config.dim_feedforward)
20
+ self.linear2 = nn.Linear(config.dim_feedforward, config.hidden_size)
21
+ self.norm1 = nn.LayerNorm(config.hidden_size)
22
+ self.norm2 = nn.LayerNorm(config.hidden_size)
23
+ self.dropout = nn.Dropout(config.dropout)
24
+
25
+ def forward(
26
+ self,
27
+ tgt: torch.Tensor, # (B,K,D)
28
+ memory: torch.Tensor, # (B,L,D)
29
+ memory_mask: Optional[torch.Tensor] = None,
30
+ ) -> torch.Tensor:
31
+ # cross-attn
32
+ residual = tgt
33
+ tgt2, _ = self.cross_attn(
34
+ query=tgt,
35
+ key=memory,
36
+ value=memory,
37
+ key_padding_mask=memory_mask,
38
+ )
39
+ tgt = residual + self.dropout(tgt2)
40
+ tgt = self.norm1(tgt)
41
+
42
+ # FFN
43
+ residual = tgt
44
+ tgt2 = self.linear2(F.gelu(self.linear1(tgt)))
45
+ tgt = residual + self.dropout(tgt2)
46
+ tgt = self.norm2(tgt)
47
+ return tgt
48
+
49
+
50
+ class _DetrDecoderLayer(nn.Module):
51
+ def __init__(self, config: object) -> None:
52
+ super().__init__()
53
+ self.self_attn = nn.MultiheadAttention(
54
+ embed_dim=config.hidden_size,
55
+ num_heads=config.nhead,
56
+ dropout=config.dropout,
57
+ batch_first=True,
58
+ )
59
+ self.cross_attn = nn.MultiheadAttention(
60
+ embed_dim=config.hidden_size,
61
+ num_heads=config.nhead,
62
+ dropout=config.dropout,
63
+ batch_first=True,
64
+ )
65
+ self.linear1 = nn.Linear(config.hidden_size, config.dim_feedforward)
66
+ self.linear2 = nn.Linear(config.dim_feedforward, config.hidden_size)
67
+ self.norm1 = nn.LayerNorm(config.hidden_size)
68
+ self.norm2 = nn.LayerNorm(config.hidden_size)
69
+ self.norm3 = nn.LayerNorm(config.hidden_size)
70
+ self.dropout = nn.Dropout(config.dropout)
71
+
72
+ def forward(
73
+ self,
74
+ tgt: torch.Tensor, # (B,K,D)
75
+ memory: torch.Tensor, # (B,L,D)
76
+ memory_mask: Optional[torch.Tensor] = None,
77
+ ) -> torch.Tensor:
78
+ # self-attn
79
+ residual = tgt
80
+ tgt2, _ = self.self_attn(tgt, tgt, tgt)
81
+ tgt = residual + self.dropout(tgt2)
82
+ tgt = self.norm1(tgt)
83
+
84
+ # cross-attn
85
+ residual = tgt
86
+ tgt2, _ = self.cross_attn(
87
+ query=tgt,
88
+ key=memory,
89
+ value=memory,
90
+ key_padding_mask=memory_mask,
91
+ )
92
+ tgt = residual + self.dropout(tgt2)
93
+ tgt = self.norm2(tgt)
94
+
95
+ # FFN
96
+ residual = tgt
97
+ tgt2 = self.linear2(F.gelu(self.linear1(tgt)))
98
+ tgt = residual + self.dropout(tgt2)
99
+ tgt = self.norm3(tgt)
100
+ return tgt
101
+
102
+
103
class DetrOvdHead(nn.Module):
    """
    Unified OVD head over backbone token features.

    - head_type="detr": stack of DETR-style decoder layers (heavier).
    - any other value (default "small"): a single cross-attention pooling block.

    forward() returns per-query boxes in (x1, y1, x2, y2) normalized to
    [0, 1], and raw objectness logits (callers apply sigmoid themselves).
    """

    def __init__(self, config: object) -> None:
        super().__init__()
        self.config = config
        self.num_queries = int(config.num_queries)
        self.d_model = int(config.hidden_size)
        self.head_type = str(getattr(config, "head_type", "small"))

        # Learned object queries, shared across the batch.
        self.query_embed = nn.Embedding(self.num_queries, self.d_model)

        if self.head_type == "detr":
            depth = int(config.num_decoder_layers)
            self.layers = nn.ModuleList(_DetrDecoderLayer(config) for _ in range(depth))
            self.pooling = None
        else:
            # Default "small" head: one cross-attention pooling block.
            self.pooling = _CrossAttentionLayer(config)
            self.layers = None

        self.bbox_head = nn.Sequential(
            nn.Linear(self.d_model, self.d_model),
            nn.ReLU(),
            nn.Linear(self.d_model, 4),
        )
        self.score_head = nn.Sequential(
            nn.Linear(self.d_model, self.d_model),
            nn.ReLU(),
            nn.Linear(self.d_model, 1),
        )

    def forward(
        self,
        memory: torch.Tensor,  # (B, L, D) backbone tokens
        memory_mask: torch.Tensor | None = None,  # (B, L) padding mask or None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch = memory.shape[0]

        # Broadcast the learned queries over the batch: (B, K, D).
        hidden = self.query_embed.weight.unsqueeze(0).expand(batch, -1, -1)

        if self.head_type == "detr":
            assert self.layers is not None
            for block in self.layers:
                hidden = block(hidden, memory, memory_mask)
        else:
            assert self.pooling is not None
            hidden = self.pooling(hidden, memory, memory_mask)

        # Box head predicts (cx, cy, w, h), squashed to [0, 1].
        cxcywh = self.bbox_head(hidden).sigmoid()  # (B, K, 4)

        # Convert center format to corners and keep inside the unit square.
        centers = cxcywh[..., :2]
        half_wh = cxcywh[..., 2:] / 2
        pred_boxes = torch.cat([centers - half_wh, centers + half_wh], dim=-1).clamp(0, 1)

        pred_logits = self.score_head(hidden)  # (B, K, 1) raw logits for BCE-with-logits
        return pred_boxes, pred_logits
178
+
179
+
hungarian_matcher.py ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hungarian Matcher and Loss Functions for OVD Training
3
+ Based on DETR's bipartite matching approach.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from scipy.optimize import linear_sum_assignment
10
+
11
+
12
def generalized_box_iou(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """
    Pairwise generalized IoU between two sets of boxes.

    Args:
        boxes1: (N, 4) in (x1, y1, x2, y2) format
        boxes2: (M, 4) in (x1, y1, x2, y2) format

    Returns:
        (N, M) GIoU matrix in [-1, 1].
    """
    eps = 1e-6
    a = boxes1[:, None, :]  # (N, 1, 4)
    b = boxes2[None, :, :]  # (1, M, 4)

    # Intersection: overlap of the two corner boxes, clamped at zero.
    inter_wh = (torch.min(a[..., 2:], b[..., 2:]) - torch.max(a[..., :2], b[..., :2])).clamp(min=0)
    intersection = inter_wh[..., 0] * inter_wh[..., 1]  # (N, M)

    area_a = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])  # (N,)
    area_b = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])  # (M,)
    union = area_a[:, None] + area_b[None, :] - intersection  # (N, M)

    iou = intersection / (union + eps)

    # Smallest enclosing (hull) box for the GIoU penalty term.
    hull_wh = (torch.max(a[..., 2:], b[..., 2:]) - torch.min(a[..., :2], b[..., :2])).clamp(min=0)
    hull_area = hull_wh[..., 0] * hull_wh[..., 1]  # (N, M)

    return iou - (hull_area - union) / (hull_area + eps)
45
+
46
+
47
def generalized_box_iou_pairwise(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """
    Element-wise generalized IoU between two equally-sized box sets.

    Args:
        boxes1: (N, 4) in (x1, y1, x2, y2)
        boxes2: (N, 4) in (x1, y1, x2, y2)

    Returns:
        (N,) GIoU for each row pair.
    """
    assert boxes1.shape == boxes2.shape
    eps = 1e-6

    # Intersection of each row pair.
    inter_wh = (torch.min(boxes1[:, 2:], boxes2[:, 2:]) - torch.max(boxes1[:, :2], boxes2[:, :2])).clamp(min=0)
    intersection = inter_wh[:, 0] * inter_wh[:, 1]  # (N,)

    area_a = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])  # (N,)
    area_b = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])  # (N,)
    union = area_a + area_b - intersection  # (N,)

    iou = intersection / (union + eps)

    # Smallest enclosing (hull) box per pair for the GIoU penalty.
    hull_wh = (torch.max(boxes1[:, 2:], boxes2[:, 2:]) - torch.min(boxes1[:, :2], boxes2[:, :2])).clamp(min=0)
    hull_area = hull_wh[:, 0] * hull_wh[:, 1]  # (N,)

    return iou - (hull_area - union) / (hull_area + eps)
81
+
82
+
83
class HungarianMatcher(nn.Module):
    """
    Optimal bipartite assignment between predictions and ground-truth boxes.

    The cost of a (prediction, target) pair combines an L1 box distance,
    a negative GIoU term, and a negative objectness probability, weighted
    by ``cost_bbox`` / ``cost_giou`` / ``cost_class``.
    """

    def __init__(
        self,
        cost_bbox: float = 5.0,
        cost_giou: float = 2.0,
        cost_class: float = 1.0,
    ) -> None:
        super().__init__()
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        self.cost_class = cost_class

    @torch.no_grad()
    def forward(
        self,
        pred_boxes: torch.Tensor,
        pred_scores: torch.Tensor,
        target_boxes: torch.Tensor,
        target_mask: torch.Tensor,
    ) -> list[tuple[torch.Tensor, torch.Tensor]]:
        """
        Match predictions to targets independently per batch element.

        Args:
            pred_boxes: (B, num_queries, 4) predicted boxes, (x1, y1, x2, y2)
            pred_scores: (B, num_queries, 1) objectness logits
            target_boxes: (B, max_targets, 4) padded target boxes
            target_mask: (B, max_targets) True where the target slot is valid

        Returns:
            One (pred_indices, target_indices) pair of long tensors per batch
            element; both empty when that element has no valid targets.
        """
        batch_size = pred_boxes.shape[0]
        device = pred_boxes.device

        matches: list[tuple[torch.Tensor, torch.Tensor]] = []
        for b in range(batch_size):
            keep = target_mask[b]  # (max_targets,)
            num_targets = int(keep.sum().item())

            if num_targets == 0:
                matches.append(
                    (
                        torch.empty(0, dtype=torch.long, device=device),
                        torch.empty(0, dtype=torch.long, device=device),
                    )
                )
                continue

            # Gather valid targets by index — padding need not be a prefix.
            gt_slots = keep.nonzero(as_tuple=False).squeeze(-1)  # (num_targets,)

            # float32 copies: torch.cdist does not support bfloat16.
            cand_boxes = pred_boxes[b].float()  # (num_queries, 4)
            gt_boxes = target_boxes[b, gt_slots].float()  # (num_targets, 4)
            cand_probs = pred_scores[b].squeeze(-1).sigmoid()  # (num_queries,)

            # Weighted total cost: L1 distance, negative GIoU, negative confidence.
            cost = (
                self.cost_bbox * torch.cdist(cand_boxes, gt_boxes, p=1)
                + self.cost_giou * -generalized_box_iou(cand_boxes, gt_boxes)
                + self.cost_class * -cand_probs.unsqueeze(1).expand(-1, num_targets)
            )

            # The Hungarian algorithm runs on CPU via scipy.
            row_ind, col_ind = linear_sum_assignment(cost.cpu().numpy())

            # Translate local target positions back to padded-slot indices.
            matched_slots = gt_slots.to(device)[col_ind]
            matches.append(
                (
                    torch.as_tensor(row_ind, dtype=torch.long, device=device),
                    matched_slots.to(device=device, dtype=torch.long),
                )
            )

        return matches
181
+
182
def sigmoid_focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    alpha: float = 0.25,
    gamma: float = 2.0,
    reduction: str = "none",
) -> torch.Tensor:
    """
    Focal loss for dense binary classification (Lin et al., RetinaNet).

    Args:
        inputs: raw logits (pre-sigmoid)
        targets: binary labels (0 or 1)
        alpha: weight on the positive class (1 - alpha on negatives)
        gamma: focusing exponent; larger down-weights easy examples more
        reduction: 'none', 'mean', or 'sum'
    """
    prob = torch.sigmoid(inputs)
    bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

    # pt is the model's probability of the true class.
    pt = prob * targets + (1.0 - prob) * (1.0 - targets)
    # Per-element alpha: alpha for positives, (1 - alpha) for negatives.
    alpha_t = alpha * targets + (1.0 - alpha) * (1.0 - targets)

    loss = alpha_t * bce * (1.0 - pt) ** gamma

    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss
213
+
214
class OvdCriterion(nn.Module):
    """
    Loss criterion for OVD training with Hungarian matching.

    Produces L1 + GIoU box losses over matched pairs and a per-query
    objectness loss (focal or BCE), combined into ``loss_total`` with the
    configured weights.
    """

    def __init__(
        self,
        matcher: HungarianMatcher,
        loss_bbox: float = 5.0,
        loss_giou: float = 2.0,
        loss_cls: float = 1.0,
        eos_coef: float = 0.1,  # Weight for no-object class (BCE mode only)
        use_focal_loss: bool = True,  # Use focal loss instead of BCE
        focal_alpha: float = 0.75,  # Higher weight for positive (sparse in OVD)
        focal_gamma: float = 2.0,
    ) -> None:
        super().__init__()
        self.matcher = matcher
        self.loss_bbox_weight = loss_bbox
        self.loss_giou_weight = loss_giou
        self.loss_cls_weight = loss_cls
        self.eos_coef = eos_coef
        self.use_focal_loss = use_focal_loss
        self.focal_alpha = focal_alpha
        self.focal_gamma = focal_gamma

    def loss_boxes(
        self,
        pred_boxes: torch.Tensor,
        target_boxes: torch.Tensor,
        indices: list[tuple[torch.Tensor, torch.Tensor]],
        num_boxes: int,
    ) -> dict[str, torch.Tensor]:
        """
        Compute bounding box losses (L1 + GIoU) over matched pairs,
        normalized by the total target count ``num_boxes``.
        """
        src_boxes_list: list[torch.Tensor] = []
        tgt_boxes_list: list[torch.Tensor] = []

        # Gather matched (prediction, target) box pairs across the batch.
        for b, (src_idx, tgt_idx) in enumerate(indices):
            if src_idx.numel() > 0:
                src_boxes_list.append(pred_boxes[b, src_idx])
                tgt_boxes_list.append(target_boxes[b, tgt_idx])

        # No matches anywhere in the batch -> zero losses (keeps training stable).
        if not src_boxes_list:
            device = pred_boxes.device
            return {
                "loss_bbox": torch.tensor(0.0, device=device),
                "loss_giou": torch.tensor(0.0, device=device),
            }

        src_boxes_all = torch.cat(src_boxes_list, dim=0).float()  # (N, 4)
        tgt_boxes_all = torch.cat(tgt_boxes_list, dim=0).float()  # (N, 4)

        # L1 loss
        loss_bbox = F.l1_loss(src_boxes_all, tgt_boxes_all, reduction="sum") / max(num_boxes, 1)

        # Element-wise GIoU loss (avoid building full N x N matrix)
        giou = generalized_box_iou_pairwise(src_boxes_all, tgt_boxes_all)  # (N,)
        loss_giou = (1.0 - giou).sum() / max(num_boxes, 1)

        return {
            "loss_bbox": loss_bbox,
            "loss_giou": loss_giou,
        }

    def loss_labels(
        self,
        pred_scores: torch.Tensor,
        target_mask: torch.Tensor,
        indices: list[tuple[torch.Tensor, torch.Tensor]],
        num_boxes: int,
    ) -> dict[str, torch.Tensor]:
        """
        Compute classification loss (objectness): matched queries are labeled
        1, unmatched 0; per-query loss is focal or BCE per ``use_focal_loss``.
        """
        B, num_queries, _ = pred_scores.shape
        device = pred_scores.device

        # Create target labels: 1 for matched queries, 0 for unmatched
        target_labels = torch.zeros(B, num_queries, dtype=torch.float32, device=device)

        for b, (src_idx, _) in enumerate(indices):
            if src_idx.numel() > 0:
                target_labels[b, src_idx] = 1.0

        pred_logits = pred_scores.squeeze(-1)

        if self.use_focal_loss:
            # Focal Loss: focuses on hard examples, handles class imbalance better
            loss_per_query = sigmoid_focal_loss(
                pred_logits,
                target_labels,
                alpha=self.focal_alpha,
                gamma=self.focal_gamma,
                reduction="none",
            )
        else:
            # BCE with manual weighting for no-object class
            loss_per_query = F.binary_cross_entropy_with_logits(
                pred_logits,
                target_labels,
                reduction="none",
            )

        # NOTE(review): the eos_coef pos/neg re-weighting below is applied in
        # BOTH branches, although the __init__ comment says "BCE mode only" —
        # confirm whether focal mode is meant to skip it.
        pos_mask = target_labels == 1
        neg_mask = ~pos_mask

        # Clamp counts at 1 to avoid division by zero on all-pos/all-neg batches.
        pos_count = max(int(pos_mask.sum().item()), 1)
        neg_count = max(int(neg_mask.sum().item()), 1)

        pos_loss = loss_per_query[pos_mask].sum() / pos_count
        neg_loss = loss_per_query[neg_mask].sum() / neg_count

        loss_cls = pos_loss + self.eos_coef * neg_loss

        return {"loss_cls": loss_cls}

    def forward(
        self,
        pred_boxes: torch.Tensor,
        pred_scores: torch.Tensor,
        target_boxes: torch.Tensor,
        target_mask: torch.Tensor,
    ) -> dict[str, torch.Tensor]:
        """
        Compute total loss.

        Args:
            pred_boxes: (B, num_queries, 4) predicted boxes
            pred_scores: (B, num_queries, 1) predicted objectness scores (logits)
            target_boxes: (B, max_targets, 4) target boxes
            target_mask: (B, max_targets) True for valid targets

        Returns:
            Dict with "loss_bbox", "loss_giou", "loss_cls", and the weighted
            "loss_total".
        """
        # Compute matching
        indices = self.matcher(pred_boxes, pred_scores, target_boxes, target_mask)

        # Count total number of target boxes for normalization
        num_boxes = int(target_mask.sum().item())
        num_boxes = max(num_boxes, 1)

        # Box losses
        box_losses = self.loss_boxes(pred_boxes, target_boxes, indices, num_boxes)
        # Classification losses
        cls_losses = self.loss_labels(pred_scores, target_mask, indices, num_boxes)

        losses: dict[str, torch.Tensor] = {}
        losses.update(box_losses)
        losses.update(cls_losses)

        # Total loss
        losses["loss_total"] = (
            self.loss_bbox_weight * losses["loss_bbox"]
            + self.loss_giou_weight * losses["loss_giou"]
            + self.loss_cls_weight * losses["loss_cls"]
        )

        return losses
375
+
376
+
377
class BboxOnlyCriterion(nn.Module):
    """
    Box-regression-only criterion: Hungarian matching for assignment, then
    L1 + GIoU losses. No classification loss is trained (reported as zero
    for logging compatibility).
    """

    def __init__(
        self,
        matcher: HungarianMatcher,
        loss_bbox: float = 5.0,
        loss_giou: float = 2.0,
    ) -> None:
        super().__init__()
        self.matcher = matcher
        self.loss_bbox_weight = loss_bbox
        self.loss_giou_weight = loss_giou

    def loss_boxes(
        self,
        pred_boxes: torch.Tensor,
        target_boxes: torch.Tensor,
        indices: list[tuple[torch.Tensor, torch.Tensor]],
        num_boxes: int,
    ) -> dict[str, torch.Tensor]:
        """L1 + GIoU losses over matched pairs, normalized by num_boxes."""
        matched_preds = [
            pred_boxes[b, src_idx]
            for b, (src_idx, _) in enumerate(indices)
            if src_idx.numel() > 0
        ]
        matched_tgts = [
            target_boxes[b, tgt_idx]
            for b, (src_idx, tgt_idx) in enumerate(indices)
            if src_idx.numel() > 0
        ]

        # No matches at all -> zero losses.
        if not matched_preds:
            zero = torch.tensor(0.0, device=pred_boxes.device)
            return {"loss_bbox": zero, "loss_giou": zero}

        src = torch.cat(matched_preds, dim=0).float()  # (N, 4)
        tgt = torch.cat(matched_tgts, dim=0).float()  # (N, 4)

        denom = max(num_boxes, 1)
        l1 = F.l1_loss(src, tgt, reduction="sum") / denom
        # Element-wise GIoU keeps this O(N) instead of an N x N matrix.
        giou_term = (1.0 - generalized_box_iou_pairwise(src, tgt)).sum() / denom

        return {"loss_bbox": l1, "loss_giou": giou_term}

    def forward(
        self,
        pred_boxes: torch.Tensor,
        pred_scores: torch.Tensor,
        target_boxes: torch.Tensor,
        target_mask: torch.Tensor,
    ) -> dict[str, torch.Tensor]:
        """
        Compute total loss (bbox only).

        Args:
            pred_boxes: (B, num_queries, 4) predicted boxes
            pred_scores: (B, num_queries, 1) objectness logits — matching only
            target_boxes: (B, max_targets, 4) target boxes
            target_mask: (B, max_targets) True for valid targets

        Returns:
            Dict with "loss_bbox", "loss_giou", a zero "loss_cls", and the
            weighted "loss_total".
        """
        # Hungarian matching still provides the optimal assignment.
        indices = self.matcher(pred_boxes, pred_scores, target_boxes, target_mask)

        # Normalize by the number of valid targets (at least one).
        num_boxes = max(int(target_mask.sum().item()), 1)

        losses: dict[str, torch.Tensor] = dict(
            self.loss_boxes(pred_boxes, target_boxes, indices, num_boxes)
        )
        # Zero classification loss keeps downstream logging uniform.
        losses["loss_cls"] = torch.tensor(0.0, device=pred_boxes.device)
        losses["loss_total"] = (
            self.loss_bbox_weight * losses["loss_bbox"]
            + self.loss_giou_weight * losses["loss_giou"]
        )
        return losses
476
+
477
+
478
def build_criterion(
    cost_bbox: float = 5.0,
    cost_giou: float = 2.0,
    cost_class: float = 1.0,
    loss_bbox: float = 5.0,
    loss_giou: float = 2.0,
    loss_cls: float = 1.0,
    eos_coef: float = 0.1,
    use_focal_loss: bool = True,
    focal_alpha: float = 0.25,
    focal_gamma: float = 2.0,
    loss_mode: str = "hungarian",
) -> OvdCriterion | BboxOnlyCriterion:
    """
    Build the training criterion around a shared HungarianMatcher.

    Args:
        loss_mode: "bbox_only" trains box regression only; any other value
            (default "hungarian") enables the full classification + box losses.

    Returns:
        A criterion module ready for training.
    """
    matcher = HungarianMatcher(
        cost_bbox=cost_bbox,
        cost_giou=cost_giou,
        cost_class=cost_class,
    )

    if loss_mode == "bbox_only":
        return BboxOnlyCriterion(
            matcher=matcher,
            loss_bbox=loss_bbox,
            loss_giou=loss_giou,
        )

    # Default: full Hungarian criterion with classification loss.
    return OvdCriterion(
        matcher=matcher,
        loss_bbox=loss_bbox,
        loss_giou=loss_giou,
        loss_cls=loss_cls,
        eos_coef=eos_coef,
        use_focal_loss=use_focal_loss,
        focal_alpha=focal_alpha,
        focal_gamma=focal_gamma,
    )
521
+
522
+
523
def box_iou(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """
    Compute element-wise IoU between two paired sets of boxes.

    Args:
        boxes1: (N, 4) in (x1, y1, x2, y2) format
        boxes2: (N, 4) in (x1, y1, x2, y2) format (same N, element-wise)

    Returns:
        iou: (N,) IoU for each pair, in [0, 1]
    """
    # Intersection rectangle; clamping makes disjoint pairs contribute 0.
    lt = torch.max(boxes1[:, :2], boxes2[:, :2])
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]

    # Clamp side lengths so degenerate (x2 < x1 / y2 < y1) boxes cannot
    # produce negative areas — consistent with box_iou_matrix below.
    area1 = (boxes1[:, 2] - boxes1[:, 0]).clamp(min=0) * (boxes1[:, 3] - boxes1[:, 1]).clamp(min=0)
    area2 = (boxes2[:, 2] - boxes2[:, 0]).clamp(min=0) * (boxes2[:, 3] - boxes2[:, 1]).clamp(min=0)
    union = area1 + area2 - inter

    eps = 1e-6  # avoid division by zero for zero-area pairs
    return inter / (union + eps)
546
+
547
def box_iou_matrix(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """
    Compute the all-pairs IoU matrix between two sets of boxes.

    Args:
        boxes1: (N, 4) in (x1, y1, x2, y2) format
        boxes2: (M, 4) in (x1, y1, x2, y2) format

    Returns:
        iou: (N, M)
    """
    n, m = boxes1.shape[0], boxes2.shape[0]
    if boxes1.numel() == 0 or boxes2.numel() == 0:
        # Degenerate input: an empty (N, M) grid.
        return torch.zeros((n, m), device=boxes1.device, dtype=boxes1.dtype)

    # Per-box areas with clamped side lengths (degenerate boxes -> area 0).
    area1 = (boxes1[:, 2] - boxes1[:, 0]).clamp(min=0) * (boxes1[:, 3] - boxes1[:, 1]).clamp(min=0)
    area2 = (boxes2[:, 2] - boxes2[:, 0]).clamp(min=0) * (boxes2[:, 3] - boxes2[:, 1]).clamp(min=0)

    # Broadcast to (N, M, 2) corner grids for the intersection rectangle.
    top_left = torch.max(boxes1[:, None, :2], boxes2[None, :, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[None, :, 2:])
    inter_wh = (bottom_right - top_left).clamp(min=0)
    inter = inter_wh[..., 0] * inter_wh[..., 1]

    union = area1[:, None] + area2[None, :] - inter
    eps = 1e-6
    return inter / (union + eps)
572
+
573
+
574
def _ap_101(recalls: torch.Tensor, precisions: torch.Tensor) -> float:
    """
    COCO-style AP approximation by 101-point interpolation.

    Args:
        recalls: (P,) non-decreasing recall values.
        precisions: (P,) precision values aligned with `recalls`.

    Returns:
        Scalar AP in [0, 1]; 0.0 for empty input.
    """
    if recalls.numel() == 0:
        return 0.0

    recalls = recalls.clamp(0, 1)
    precisions = precisions.clamp(0, 1)

    # Precision envelope (monotone non-increasing): at each point take the max
    # precision over all later points. Vectorized backward scan via
    # flip + cummax instead of an O(P) Python loop.
    mpre = precisions.flip(0).cummax(dim=0).values.flip(0)

    # Sample the envelope at 101 evenly spaced recall levels in one batched
    # searchsorted call. Levels above the max achieved recall (index == P)
    # contribute 0, matching the original per-point loop.
    grid = torch.linspace(0, 1, 101, device=recalls.device, dtype=recalls.dtype)
    idx = torch.searchsorted(recalls, grid)
    valid = idx < mpre.numel()
    ap = float(mpre[idx[valid]].sum().item())
    return ap / 101.0
596
+
597
+
598
@torch.no_grad()
def compute_metrics(
    pred_boxes: torch.Tensor,
    pred_scores: torch.Tensor,
    target_boxes: torch.Tensor,
    target_mask: torch.Tensor,
    iou_thresholds: list[float] | None = None,
    score_threshold: float = 0.5,
) -> dict[str, float]:
    """
    Compute detection-style metrics for OVD (single-class).

    Notes:
        - Computes COCO-style mAP over IoU thresholds [0.50:0.95:0.05].
        - Computes AP50/AP75.
        - Computes precision/recall at IoU=0.50 with a fixed score threshold.
        - Matching is greedy per prediction in descending score order; each
          prediction is compared only against its single highest-IoU GT box
          (see NOTE in the loop below).

    Args:
        pred_boxes: (B, num_queries, 4) predicted boxes
        pred_scores: (B, num_queries, 1) predicted objectness scores (logits)
        target_boxes: (B, max_targets, 4) target boxes
        target_mask: (B, max_targets) True for valid targets
        iou_thresholds: extra IoU thresholds for the recall@thr entries
        score_threshold: score threshold for the precision/recall entries

    Returns:
        Dictionary of metrics
    """
    if iou_thresholds is None:
        iou_thresholds = [0.5, 0.75]

    B = pred_boxes.shape[0]

    # COCO mAP thresholds: 0.50, 0.55, ..., 0.95 (rounded to kill float drift
    # from torch.arange so dict lookups by threshold value are exact).
    coco_thresholds = [round(x, 2) for x in torch.arange(0.5, 0.96, 0.05).tolist()]

    total_gt = int(target_mask.sum().item())
    metrics: dict[str, float] = {
        "num_gt": float(total_gt),
        "num_queries": float(pred_boxes.shape[1]),
    }
    if total_gt == 0:
        # No ground truth anywhere in the batch: all AP/PR metrics are 0 by definition.
        metrics.update({"mAP": 0.0, "AP50": 0.0, "AP75": 0.0, "precision@0.5": 0.0, "recall@0.5": 0.0})
        return metrics

    # For each IoU threshold, collect (score, is_tp) across the batch.
    thr_to_scores: dict[float, list[torch.Tensor]] = {thr: [] for thr in coco_thresholds}
    thr_to_is_tp: dict[float, list[torch.Tensor]] = {thr: [] for thr in coco_thresholds}

    # Also track precision/recall at IoU=0.5 using a fixed score threshold.
    pr_iou_thr = 0.5
    pr_tp = 0
    pr_fp = 0

    for b in range(B):
        gt_mask_b = target_mask[b]
        if gt_mask_b.sum() == 0:
            # Images without GT are skipped entirely, so their predictions do
            # not enter the AP pool as false positives.
            continue
        gt_boxes = target_boxes[b, gt_mask_b].float()  # (G, 4)

        scores = pred_scores[b].squeeze(-1).sigmoid().float()  # (Q,) probabilities
        boxes = pred_boxes[b].float()  # (Q, 4)
        order = torch.argsort(scores, descending=True)
        scores_sorted = scores[order]
        boxes_sorted = boxes[order]

        # Precompute IoU matrix (Q, G) once per image
        ious_qg = box_iou_matrix(boxes_sorted, gt_boxes)  # (Q, G)

        for thr in coco_thresholds:
            matched_gt = torch.zeros((gt_boxes.shape[0],), dtype=torch.bool, device=ious_qg.device)
            is_tp = torch.zeros((boxes_sorted.shape[0],), dtype=torch.bool, device=ious_qg.device)

            # Greedy assignment in descending score order.
            # NOTE(review): each prediction only considers its single
            # highest-IoU GT; if that GT is already matched, the prediction
            # becomes FP even when another unmatched GT also exceeds `thr`.
            # This is stricter than COCO's matcher (which picks the best
            # *unmatched* GT) — confirm this is intended.
            for i in range(boxes_sorted.shape[0]):
                if gt_boxes.shape[0] == 0:
                    break
                ious_i = ious_qg[i]  # (G,)
                max_iou, max_j = torch.max(ious_i, dim=0)
                if float(max_iou.item()) >= thr and not bool(matched_gt[max_j].item()):
                    is_tp[i] = True
                    matched_gt[max_j] = True

            thr_to_scores[thr].append(scores_sorted)
            thr_to_is_tp[thr].append(is_tp)

            # PR at IoU=0.5 with score threshold (count TP/FP after matching)
            if thr == pr_iou_thr:
                keep = scores_sorted >= score_threshold
                pr_tp += int(is_tp[keep].sum().item())
                pr_fp += int((~is_tp[keep]).sum().item())

    # Compute APs
    aps: list[float] = []
    ap50 = 0.0
    ap75 = 0.0
    for thr in coco_thresholds:
        if not thr_to_scores[thr]:
            # No image contributed predictions at this threshold.
            aps.append(0.0)
            continue
        scores_all = torch.cat(thr_to_scores[thr], dim=0)
        is_tp_all = torch.cat(thr_to_is_tp[thr], dim=0).to(dtype=torch.float32)

        # Sort globally by score
        global_order = torch.argsort(scores_all, descending=True)
        is_tp_all = is_tp_all[global_order]
        is_fp_all = 1.0 - is_tp_all

        cum_tp = torch.cumsum(is_tp_all, dim=0)
        cum_fp = torch.cumsum(is_fp_all, dim=0)

        # Recall over all GT in the batch; precision clamped to avoid 0/0.
        recalls = cum_tp / max(float(total_gt), 1.0)
        precisions = cum_tp / torch.clamp(cum_tp + cum_fp, min=1.0)

        ap = _ap_101(recalls, precisions)
        aps.append(ap)
        # Float-tolerant comparison against the rounded threshold values.
        if abs(thr - 0.5) < 1e-9:
            ap50 = ap
        if abs(thr - 0.75) < 1e-9:
            ap75 = ap

    metrics["mAP"] = float(sum(aps) / max(len(aps), 1))
    metrics["AP50"] = float(ap50)
    metrics["AP75"] = float(ap75)

    precision = pr_tp / max(pr_tp + pr_fp, 1)
    recall = pr_tp / max(total_gt, 1)
    metrics["precision@0.5"] = float(precision)
    metrics["recall@0.5"] = float(recall)

    # Keep backward compatible recalls for common IoU thresholds at the fixed score threshold.
    # (Same definition as above but for additional IoU thresholds if requested.)
    for thr in iou_thresholds:
        if thr not in coco_thresholds:
            # Only thresholds already evaluated above can be reported.
            continue
        if not thr_to_scores[thr]:
            metrics[f"recall@{thr}"] = 0.0
            continue
        scores_all = torch.cat(thr_to_scores[thr], dim=0)
        is_tp_all = torch.cat(thr_to_is_tp[thr], dim=0)
        keep = scores_all >= score_threshold
        tp = int(is_tp_all[keep].sum().item())
        metrics[f"recall@{thr}"] = float(tp / max(total_gt, 1))

    return metrics
internvl_image_procesing.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ from torchvision import transforms as T
4
+ from torchvision.transforms import InterpolationMode
5
+
6
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
7
+ IMAGENET_STD = (0.229, 0.224, 0.225)
8
+
9
def build_transform(input_size):
    """Eval-time InternVL image transform: ensure RGB, bicubic-resize to a
    square of `input_size`, convert to tensor, normalize with ImageNet stats."""
    return T.Compose(
        [
            T.Lambda(lambda img: img if img.mode == 'RGB' else img.convert('RGB')),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
18
+
19
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid (cols, rows) whose aspect ratio is closest to the
    image's aspect ratio.

    On an exact tie in aspect-ratio distance, a later candidate wins only if
    the source image area exceeds half the tiled target area (i.e. the image
    is large enough to justify more tiles).
    """
    source_area = width * height
    best = (1, 1)
    best_diff = float('inf')
    for cols, rows in target_ratios:
        diff = abs(aspect_ratio - cols / rows)
        if diff < best_diff:
            best_diff = diff
            best = (cols, rows)
        elif diff == best_diff and source_area > 0.5 * image_size * image_size * cols * rows:
            best = (cols, rows)
    return best
33
+
34
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """
    Split a PIL image into a grid of square `image_size` tiles (InternVL
    dynamic tiling).

    Args:
        image: PIL image to tile.
        min_num: Minimum total number of tiles.
        max_num: Maximum total number of tiles.
        image_size: Side length of each square tile in pixels.
        use_thumbnail: If True and more than one tile was produced, append a
            whole-image thumbnail resized to (image_size, image_size).

    Returns:
        List of PIL image tiles (length `cols*rows`, plus 1 if a thumbnail
        was appended).
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate all (cols, rows) grids whose tile count lies in [min_num, max_num].
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # Choose the grid whose aspect ratio best matches the image's.
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # Target canvas size: cols*image_size wide, rows*image_size tall.
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # Resize (possibly distorting aspect ratio slightly), then crop tiles
    # row-major: i % cols selects the column, i // cols the row.
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images
71
+
72
def load_image(image_file, input_size=448, max_num=12):
    """Load an image (path or PIL image), tile it with dynamic preprocessing
    (thumbnail included), and return a stacked tensor of shape
    (num_tiles, 3, input_size, input_size)."""
    if isinstance(image_file, str):
        pil_image = Image.open(image_file)
    else:
        pil_image = image_file
    pil_image = pil_image.convert('RGB')

    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(pil_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])
internvl_ovd.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .heads import DetrOvdHead
3
+ from .vlmbackbone import InternVL3_5_Backbone
4
+ from torch import nn
5
+
6
+
7
class InternVL3_5_OvdModel(nn.Module):
    """Open-vocabulary detector: an InternVL backbone that produces a fused
    image+text token memory, followed by a DETR-style detection head."""

    def __init__(
        self,
        backbone: InternVL3_5_Backbone,
        model_config: object,
    ) -> None:
        super().__init__()
        self.backbone = backbone
        self.ovd_head = DetrOvdHead(model_config)
        # Keep head dtype aligned with backbone output for non-autocast inference paths.
        self.ovd_head.to(dtype=self.backbone.dtype)

    def forward(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        patch_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Run backbone fusion, then the detection head.

        Args:
            pixel_values: Image tensor.
            input_ids: Tokenized prompt.
            attention_mask: Attention mask for the prompt.
            patch_mask: Optional mask of valid image patches.

        Returns:
            (pred_boxes, pred_scores) from the OVD head.
        """
        fused_memory, pad_mask = self.backbone.forward_fused(
            pixel_values,
            input_ids,
            attention_mask,
            patch_mask=patch_mask,
        )
        boxes_out, scores_out = self.ovd_head(fused_memory, pad_mask)
        return boxes_out, scores_out
42
+
43
+
44
def build_internvl_ovd(
    model_config: object,
    device: str,
    dtype: torch.dtype,
) -> InternVL3_5_OvdModel:
    """Construct the InternVL3.5 OVD model (backbone + detection head) from
    the given config object."""
    vlm_backbone = InternVL3_5_Backbone(
        model_config.vlm_model_name,
        device,
        dtype,
        use_token_fpn=model_config.use_token_fpn,
        token_fpn_levels=model_config.token_fpn_levels,
        token_fpn_include_text=model_config.token_fpn_include_text,
    )
    return InternVL3_5_OvdModel(vlm_backbone, model_config)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0edac953471304a8759724dec3616d8010d51f588789ee3a8162c53e373312be
3
+ size 2140804602
modeling_internvl_ovd.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Optional
5
+
6
+ import torch
7
+ from PIL import Image
8
+ from transformers import PreTrainedModel
9
+ from transformers.utils import ModelOutput
10
+
11
+ from .hungarian_matcher import build_criterion
12
+ from .configuration_internvl_ovd import InternVLOVDConfig
13
+ from .internvl_ovd import build_internvl_ovd
14
+ from .internvl_image_procesing import load_image
15
+
16
+
17
@dataclass
class InternVLOVDOutput(ModelOutput):
    """Output container for InternVLOVDForDetection.

    Loss fields are populated only when the forward pass computes the
    training loss; pure inference fills in predictions only.
    """

    # Total training loss (same tensor as `loss_total`); None at inference.
    loss: torch.Tensor | None = None
    # (B, num_queries, 4) predicted boxes.
    pred_boxes: torch.Tensor | None = None
    # (B, num_queries, 1) predicted objectness logits (apply sigmoid for scores).
    pred_scores: torch.Tensor | None = None
    # Weighted sum of the individual loss terms.
    loss_total: torch.Tensor | None = None
    # Box regression (L1) loss term.
    loss_bbox: torch.Tensor | None = None
    # Generalized-IoU loss term.
    loss_giou: torch.Tensor | None = None
    # Classification loss term (fixed to zero in bbox-only loss mode).
    loss_cls: torch.Tensor | None = None
26
+
27
+
28
class InternVLOVDForDetection(PreTrainedModel):
    """HF `PreTrainedModel` wrapper around the InternVL OVD detector.

    Wraps the backbone+head model for Hub loading, optionally computes the
    training loss, and offers `infer_image` for single-image inference.
    """

    config_class = InternVLOVDConfig

    def __init__(self, config: InternVLOVDConfig) -> None:
        super().__init__(config)

        # Mixed-precision dtype selected from config ("bfloat16" vs fp16).
        amp_dtype = torch.bfloat16 if config.dtype == "bfloat16" else torch.float16

        self.inner = build_internvl_ovd(
            model_config=config,
            device=config.device_map,
            dtype=amp_dtype,
        )

        # Freeze only the vision tower; the language model and head stay trainable.
        if config.freeze_backbone:
            for name, param in self.inner.named_parameters():
                if name.startswith("backbone.vlm.vision_model"):
                    param.requires_grad = False

        # Training criterion is created lazily to keep Hub inference-only loads minimal.
        self._criterion = None

    @property
    def criterion(self) -> torch.nn.Module:
        """Lazily build and cache the training loss criterion from config."""
        if self._criterion is None:
            cfg = self.config
            self._criterion = build_criterion(
                cost_bbox=cfg.cost_bbox,
                cost_giou=cfg.cost_giou,
                cost_class=cfg.cost_class,
                loss_bbox=cfg.loss_bbox,
                loss_giou=cfg.loss_giou,
                loss_cls=cfg.loss_cls,
                eos_coef=cfg.eos_coef,
                use_focal_loss=cfg.use_focal_loss,
                focal_alpha=cfg.focal_alpha,
                focal_gamma=cfg.focal_gamma,
                loss_mode=cfg.loss_mode,
            )
        return self._criterion

    def forward_inference(
        self,
        *,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        patch_mask: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> InternVLOVDOutput:
        """Prediction-only forward pass; never computes losses."""
        # Keep the vision tower in eval mode (e.g. frozen batch-norm/dropout
        # behavior) even when the wrapper is in training mode.
        if hasattr(self.inner.backbone.vlm, "vision_model"):
            self.inner.backbone.vlm.vision_model.eval()

        pred_boxes, pred_scores = self.inner(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            patch_mask=patch_mask,
        )
        return InternVLOVDOutput(loss=None, pred_boxes=pred_boxes, pred_scores=pred_scores)

    def forward(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        patch_mask: Optional[torch.Tensor] = None,
        boxes: Optional[torch.Tensor] = None,
        box_mask: Optional[torch.Tensor] = None,
        compute_loss: bool = False,
        **kwargs: Any,
    ) -> InternVLOVDOutput:
        """Full forward pass.

        Args:
            pixel_values / input_ids / attention_mask / patch_mask: model inputs.
            boxes: target boxes, required when `compute_loss=True`.
            box_mask: validity mask for `boxes`, required when `compute_loss=True`.
            compute_loss: when True, run the criterion and fill the loss fields.

        Raises:
            ValueError: if `compute_loss=True` but targets are missing.
        """
        outputs = self.forward_inference(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            patch_mask=patch_mask,
            **kwargs,
        )

        if not compute_loss:
            return outputs

        if boxes is None or box_mask is None:
            raise ValueError("compute_loss=True requires both `boxes` and `box_mask`.")

        pred_boxes = outputs.pred_boxes
        pred_scores = outputs.pred_scores
        losses = self.criterion(pred_boxes, pred_scores, boxes, box_mask)
        loss_total = losses.get("loss_total")
        return InternVLOVDOutput(
            loss=loss_total,
            pred_boxes=pred_boxes,
            pred_scores=pred_scores,
            loss_total=loss_total,
            loss_bbox=losses.get("loss_bbox"),
            loss_giou=losses.get("loss_giou"),
            loss_cls=losses.get("loss_cls"),
        )

    @torch.no_grad()
    def infer_image(
        self,
        *,
        image: Image.Image | str,
        query: str,
        tokenizer,
        max_length: int = 4096,
        device: Optional[torch.device] = None,
    ) -> InternVLOVDOutput:
        """
        Convenience inference helper that accepts a PIL image (or path) + query text.
        Handles image preprocessing and prompt construction.
        """
        cfg = self.config
        if device is None:
            device = next(self.parameters()).device
        amp_dtype = torch.bfloat16 if cfg.dtype == "bfloat16" else torch.float16
        # CPU autocast does not support fp16; fall back to bf16.
        if device.type == "cpu" and amp_dtype == torch.float16:
            amp_dtype = torch.bfloat16

        # Tile the image; every tile is valid, hence an all-ones patch mask.
        pixel_values = load_image(image, input_size=cfg.input_size, max_num=cfg.max_num_patches)
        num_patches = int(pixel_values.shape[0])
        pixel_values = pixel_values.unsqueeze(0)
        patch_mask = torch.ones((1, num_patches), dtype=torch.bool)

        img_context_token = "<IMG_CONTEXT>"
        img_start_token = "<img>"
        img_end_token = "</img>"
        # 256 IMG_CONTEXT placeholders per tile, matching the backbone's
        # 256-tokens-per-patch assumption.
        tokens_per_patch = 256

        image_tokens = img_start_token + img_context_token * (tokens_per_patch * num_patches) + img_end_token
        prompt = (
            f"{image_tokens}\n"
            "Please provide the bounding box coordinate of the region this sentence describes: "
            f"<ref>{query}</ref>"
        )

        # NOTE(review): truncation at `max_length` could drop IMG_CONTEXT
        # placeholders when num_patches is large, which would break the
        # backbone's multiple-of-256 check — confirm max_length is always
        # large enough for cfg.max_num_patches.
        tokens = tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )

        pixel_values = pixel_values.to(device=device, dtype=amp_dtype)
        patch_mask = patch_mask.to(device=device)
        input_ids = tokens["input_ids"].to(device=device)
        attention_mask = tokens["attention_mask"].to(device=device)

        self.eval()
        amp_device_type = "cuda" if device.type == "cuda" else "cpu"
        with torch.amp.autocast(device_type=amp_device_type, dtype=amp_dtype):
            return self.forward_inference(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                patch_mask=patch_mask,
            )
vlmbackbone.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from transformers import AutoModel, AutoTokenizer
4
+ import torch.nn.functional as F
5
+
6
+
7
class VlmBackboneBase(nn.Module):
    """
    Common VLM backbone interface.

    Subclasses implement:
        forward_fused(pixel_values, input_ids, attention_mask)
            -> (memory, padding_mask)
        memory: (B, L, D) fused image+text token embeddings
        padding_mask: (B, L) with True == pad, or None when nothing is padded

    NOTE(review): the original (Korean) docstring described a
    `forward_vision(pixel_values)` method that this class never declares;
    `forward_fused` below is the actual abstract hook — confirm the old name
    is obsolete.
    """

    def __init__(self) -> None:
        super().__init__()

    def forward_fused(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        # Abstract: subclasses must fuse image + text into one token memory.
        raise NotImplementedError
25
+
26
+
27
class InternVL3_5_Backbone(VlmBackboneBase):
    """InternVL3.5-based fused backbone.

    Runs the full VLM over image tiles + prompt, projects the last hidden
    states to the DETR model dimension, and optionally repacks the
    IMG_CONTEXT token embeddings into a multi-level ("token FPN") memory.
    """

    def __init__(
        self,
        model_name: str,
        device: str,
        dtype: torch.dtype,
        *,
        use_token_fpn: bool = False,
        token_fpn_levels: tuple[int, ...] = (16, 8, 4, 2),
        token_fpn_include_text: bool = True,
    ) -> None:
        super().__init__()
        self.device = device
        self.dtype = dtype
        self.use_token_fpn = use_token_fpn
        self.token_fpn_levels = token_fpn_levels
        self.token_fpn_include_text = token_fpn_include_text
        self.vlm = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=dtype,
            low_cpu_mem_usage=False,
            device_map=None,
            _attn_implementation="flash_attention_2"
        )

        # NOTE(review): both sizes are hard-coded to 1024 — confirm they match
        # the chosen checkpoint's text hidden size and the DETR head d_model.
        self.hidden_size_llm = 1024  # InternVL3_5 text hidden dim
        self.hidden_size_detr = 1024  # DETR d_model

        # With equal in/out sizes, the identity init below makes this
        # projection a no-op at the start of training.
        self.fused_proj = nn.Linear(
            self.hidden_size_llm,
            self.hidden_size_detr,
            bias=True,
            device=None,
            dtype=dtype,
        )
        nn.init.eye_(self.fused_proj.weight)
        nn.init.zeros_(self.fused_proj.bias)

        # Set img_context_token_id for the model
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
        IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
        self.img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
        self.vlm.img_context_token_id = self.img_context_token_id

    def _build_token_fpn_memory(
        self,
        memory_last: torch.Tensor,  # (B, T, D)
        input_ids: torch.Tensor,  # (B, T)
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Build an "FPN-like" multi-level token memory from IMG_CONTEXT token embeddings.

        - Extract IMG_CONTEXT tokens per sample
        - Reshape per patch into (num_patches, 16, 16, D)
        - Pool to multiple spatial levels (e.g., 16->8->4->2)
        - Flatten and concatenate levels into one sequence
        - Optionally append non-image tokens (text + special tokens)

        Returns:
            memory: (B, L, D)
            padding_mask: (B, L) with True == pad
        """
        B, T, D = memory_last.shape
        device = memory_last.device

        # Validate levels (must be descending powers of 2 from 16)
        levels = tuple(int(x) for x in self.token_fpn_levels)
        if len(levels) == 0 or levels[0] != 16:
            raise ValueError(f"token_fpn_levels must start with 16, got {levels}")
        for a, b in zip(levels, levels[1:]):
            if a % 2 != 0 or b != a // 2:
                raise ValueError(f"token_fpn_levels must be like (16,8,4,2,...) got {levels}")

        is_img = input_ids.eq(self.img_context_token_id)  # (B, T)

        per_sample_memory: list[torch.Tensor] = []
        max_len = 0
        for i in range(B):
            img_tokens = memory_last[i][is_img[i]]  # (N_img, D)
            n_img = img_tokens.shape[0]
            if n_img == 0:
                # Fallback: no img tokens found -> keep original memory.
                # NOTE(review): in this branch with token_fpn_include_text=True,
                # txt_tokens below equals memory_last[i] as well, so the whole
                # sequence is concatenated with itself — likely unintended;
                # confirm whether the fallback should skip the text concat.
                mem_i = memory_last[i]
            else:
                # Each 448x448 tile contributes a 16x16 grid of 256 tokens.
                if n_img % 256 != 0:
                    raise ValueError(f"IMG_CONTEXT token count must be multiple of 256, got {n_img}")
                num_patches = n_img // 256

                # (num_patches, D, 16, 16)
                patch_feat = img_tokens.view(num_patches, 16, 16, D).permute(0, 3, 1, 2).contiguous()

                # Build levels by repeated 2x average pooling.
                level_tokens: list[torch.Tensor] = []
                feat = patch_feat
                cur = 16
                for lvl in levels:
                    # Ensure feat is at correct resolution
                    while cur > lvl:
                        feat = F.avg_pool2d(feat, kernel_size=2, stride=2)
                        cur //= 2
                    # Flatten: (num_patches, D, H, W) -> (num_patches*H*W, D)
                    h, w = feat.shape[-2:]
                    lvl_tok = feat.permute(0, 2, 3, 1).reshape(num_patches * h * w, D).contiguous()
                    level_tokens.append(lvl_tok)

                mem_i = torch.cat(level_tokens, dim=0)  # (L_img_fpn, D)

            if self.token_fpn_include_text:
                # Prepend all non-image tokens (text + special tokens).
                txt_tokens = memory_last[i][~is_img[i]]  # (N_txt, D)
                mem_i = torch.cat([txt_tokens, mem_i], dim=0)

            per_sample_memory.append(mem_i)
            max_len = max(max_len, mem_i.shape[0])

        # Pad to (B, max_len, D); padding_mask is True at padded positions.
        padded = memory_last.new_zeros((B, max_len, D))
        padding_mask = torch.ones((B, max_len), device=device, dtype=torch.bool)
        for i, mem_i in enumerate(per_sample_memory):
            seq_len = mem_i.shape[0]
            padded[i, :seq_len] = mem_i
            padding_mask[i, :seq_len] = False

        return padded, padding_mask

    def forward_fused(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        patch_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Run the VLM and return the projected fused token memory.

        Returns (memory, padding_mask); padding_mask is None unless the
        token-FPN repacking produced per-sample sequence lengths.
        """
        # pixel_values: (B, P, 3, H, W) — flatten tiles into the batch dim;
        # 4-D input is treated as one tile per sample with no patch mask.
        if pixel_values.dim() == 5:
            bsz, num_patches, channels, height, width = pixel_values.shape
            pixel_values = pixel_values.view(-1, channels, height, width)
            if patch_mask is not None:
                patch_mask = patch_mask.view(bsz * num_patches)
        else:
            bsz = pixel_values.shape[0]
            num_patches = 1
            patch_mask = None

        # image_flags must match number of images provided to the VLM
        if patch_mask is not None:
            image_flags = patch_mask.to(pixel_values.device, dtype=torch.long)
        else:
            image_flags = torch.ones(pixel_values.shape[0], dtype=torch.long, device=pixel_values.device)

        outputs = self.vlm(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_flags=image_flags,
            output_hidden_states=True,
            return_dict=True
        )

        # CausalLMOutputWithPast has hidden_states tuple, last element is the final layer output
        memory = outputs.hidden_states[-1]  # (B, T, hidden_size)
        memory = self.fused_proj(memory)  # (B, T, hidden_size_detr)
        if self.use_token_fpn:
            memory, padding_mask = self._build_token_fpn_memory(
                memory_last=memory,
                input_ids=input_ids,
            )
            return memory, padding_mask

        return memory, None