phanerozoic committed
Commit 911ed47 (verified) · 1 Parent(s): 4f2319a

Argus v1.1: add trained FCOS detection head as the fifth task


Adds object detection to Argus via an FCOS-style anchor-free detector
built on a ViTDet-style simple feature pyramid, trained on COCO 2017
train (117,266 images, 80 classes) with the EUPE-ViT-B backbone frozen.

Detection results on COCO val2017 (5,000 images)
-------------------------------------------------
mAP@[0.5:0.95] = 41.0
mAP@0.50 = 64.8
mAP@0.75 = 43.2
mAP small/medium/large = 21.4 / 44.9 / 62.1

For context, FCOS with a fully-trained ResNet-50-FPN backbone achieves
39.1 mAP on the same benchmark. The frozen EUPE-ViT-B backbone exceeds
that baseline at 41.0 mAP while sharing its features with four other
task heads simultaneously.

Architecture
------------
The simple feature pyramid takes the backbone stride-16 spatial features
and synthesizes five levels (P3 through P7, strides 8 through 128) via
a transposed convolution for P3, identity with channel reduction for P4,
and chained stride-2 convolutions for P5-P7, each with 256 channels and
GroupNorm. Two shared four-layer conv towers (classification and
regression) with GroupNorm and GELU process each level. Three prediction
heads output 80 classification channels, 4 box regression channels
(left/top/right/bottom distances, exponentiated with learned per-level
scale), and 1 centerness channel. 16.14M trainable parameters total.
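As a quick check on the geometry above, here is a hypothetical sketch (not code from argus.py; `decode_box` is an illustrative name): a 640px input yields 80×80 down to 5×5 grids across the five levels, and FCOS turns each location's four predicted edge distances into an xyxy box.

```python
# Pyramid geometry for a 640px input, per the strides described above.
strides = [8, 16, 32, 64, 128]                    # P3..P7
res = 640
grids = [(res // s, res // s) for s in strides]   # per-level feature-map sizes
num_locations = sum(h * w for h, w in grids)      # total candidate box centers

def decode_box(cx, cy, l, t, r, b):
    # FCOS regresses distances from a location (cx, cy) to the four box edges.
    return [cx - l, cy - t, cx + r, cy + b]

print(grids[0], grids[-1], num_locations)   # (80, 80) (5, 5) 8525
```

The 8,525 locations are what the score threshold and NMS then prune down to at most 100 detections per image.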

Training recipe
---------------
640px input with letterbox padding, batch 64, AdamW lr 1e-3, cosine
schedule with 3% warmup, weight decay 1e-4, gradient clipping at 10.0,
8 epochs, full FP32 throughout. Focal loss (alpha 0.25, gamma 2.0) for
classification, GIoU for boxes, BCE for centerness. ~6 hours wall clock
on a single RTX 6000 Ada at 0.7 it/s with 23 GB peak VRAM.
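The schedule above can be sketched as follows; `lr_at` is a hypothetical helper for illustration, not the actual training code, assuming linear warmup over the first 3% of steps followed by cosine decay to zero.

```python
import math

def lr_at(step, total_steps, base_lr=1e-3, warmup_frac=0.03):
    # Linear warmup for the first 3% of steps, then cosine decay to zero.
    warmup = max(1, int(total_steps * warmup_frac))
    if step < warmup:
        return base_lr * (step + 1) / warmup
    progress = (step - warmup) / max(1, total_steps - warmup)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))
```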

API
---
model.detect(image) returns a list of dicts:
[{"box": [x1,y1,x2,y2], "score": float, "label": int, "class_name": str}]

Detection uses a separate forward pass at 640px (the other tasks use
224/512/416), so it lives in its own method rather than in perceive().
Accepts single images or batches. Configurable score_thresh, nms_thresh,
and max_per_image.
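The return format lends itself to simple post-filtering. A sketch (the `group_by_class` helper and sample values are hypothetical; only the dict schema comes from the API above):

```python
def group_by_class(detections, min_score=0.5):
    # Bucket boxes from a detect() result by class_name,
    # dropping detections below a score threshold.
    grouped = {}
    for d in detections:
        if d["score"] >= min_score:
            grouped.setdefault(d["class_name"], []).append(d["box"])
    return grouped

# Hypothetical sample in the documented schema:
sample = [
    {"box": [10.0, 20.0, 110.0, 220.0], "score": 0.91, "label": 0, "class_name": "person"},
    {"box": [50.0, 60.0, 90.0, 100.0], "score": 0.32, "label": 16, "class_name": "dog"},
]
print(group_by_class(sample))   # {'person': [[10.0, 20.0, 110.0, 220.0]]}
```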

Backward compatibility
----------------------
All existing methods (classify, segment, depth, perceive, correspond)
return identical results to v1.0. The detection head adds 16.14M
parameters and 62 MB to the checkpoint (334 MB to 396 MB). perceive()
does not include detection in its output.
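The checkpoint delta is consistent with the parameter count; a back-of-the-envelope check (not from the repo):

```python
params = 16.14e6                 # trainable parameters in the detection head
delta_mib = params * 4 / 2**20   # FP32 storage: ~61.6 MiB, i.e. the ~62 MB growth
print(round(delta_mib, 1))       # 61.6
```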

Files changed
-------------
argus.py: +SimpleFeaturePyramid, +FCOSHead, +DetectionHead,
+detect() method, +_make_locations, +_decode_detections,
+_letterbox_to_square, +COCO_CLASSES, +FPN_STRIDES,
extended _init_weights for Conv2d/GroupNorm
model.safetensors: +79 detection_head.* tensors (334 MB to 396 MB)
config.json: +detection_num_classes, +detection_fpn_channels,
+detection_num_convs
README.md: detection in architecture diagram, mAP table,
detect() usage example, head specs, training details

argus.py CHANGED (+351, -4)
@@ -21,6 +21,7 @@ import torch.nn.functional as F
 import torch.nn.init
 from PIL import Image
 from torch import Tensor, nn
+from torchvision.ops import nms
 from torchvision.transforms import v2
 from transformers import PretrainedConfig, PreTrainedModel
 
@@ -874,6 +875,254 @@ class DepthHead(nn.Module):
         return torch.einsum("bkhw,k->bhw", logit, bins).unsqueeze(1)
 
 
+# ===========================================================================
+# Detection (FCOS with ViTDet-style simple feature pyramid)
+# ===========================================================================
+
+FPN_STRIDES = [8, 16, 32, 64, 128]
+
+COCO_CLASSES = [
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
+    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
+    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
+    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
+    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
+    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
+    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
+    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
+    "toothbrush",
+]
+
+
+class SimpleFeaturePyramid(nn.Module):
+    """ViTDet-style simple FPN: a single stride-16 ViT feature map -> P3..P7."""
+
+    def __init__(self, in_channels: int = 768, fpn_channels: int = 256):
+        super().__init__()
+        self.fpn_channels = fpn_channels
+        self.p3 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels, in_channels, 2, stride=2),
+            nn.GroupNorm(32, in_channels),
+            nn.GELU(),
+            nn.Conv2d(in_channels, fpn_channels, 1),
+            nn.GroupNorm(32, fpn_channels),
+            nn.Conv2d(fpn_channels, fpn_channels, 3, padding=1),
+            nn.GroupNorm(32, fpn_channels),
+        )
+        self.p4 = nn.Sequential(
+            nn.Conv2d(in_channels, fpn_channels, 1),
+            nn.GroupNorm(32, fpn_channels),
+            nn.Conv2d(fpn_channels, fpn_channels, 3, padding=1),
+            nn.GroupNorm(32, fpn_channels),
+        )
+        self.p5 = nn.Sequential(
+            nn.Conv2d(in_channels, in_channels, 3, stride=2, padding=1),
+            nn.GroupNorm(32, in_channels),
+            nn.GELU(),
+            nn.Conv2d(in_channels, fpn_channels, 1),
+            nn.GroupNorm(32, fpn_channels),
+            nn.Conv2d(fpn_channels, fpn_channels, 3, padding=1),
+            nn.GroupNorm(32, fpn_channels),
+        )
+        self.p6 = nn.Sequential(
+            nn.Conv2d(fpn_channels, fpn_channels, 3, stride=2, padding=1),
+            nn.GroupNorm(32, fpn_channels),
+        )
+        self.p7 = nn.Sequential(
+            nn.Conv2d(fpn_channels, fpn_channels, 3, stride=2, padding=1),
+            nn.GroupNorm(32, fpn_channels),
+        )
+
+    def forward(self, x: Tensor) -> List[Tensor]:
+        p3 = self.p3(x)
+        p4 = self.p4(x)
+        p5 = self.p5(x)
+        p6 = self.p6(p5)
+        p7 = self.p7(p6)
+        return [p3, p4, p5, p6, p7]
+
+
+class FCOSHead(nn.Module):
+    """Shared classification / box regression / centerness towers across pyramid levels."""
+
+    def __init__(self, fpn_channels: int = 256, num_classes: int = 80, num_convs: int = 4):
+        super().__init__()
+        self.num_classes = num_classes
+
+        cls_tower, reg_tower = [], []
+        for _ in range(num_convs):
+            cls_tower += [
+                nn.Conv2d(fpn_channels, fpn_channels, 3, padding=1),
+                nn.GroupNorm(32, fpn_channels),
+                nn.GELU(),
+            ]
+            reg_tower += [
+                nn.Conv2d(fpn_channels, fpn_channels, 3, padding=1),
+                nn.GroupNorm(32, fpn_channels),
+                nn.GELU(),
+            ]
+        self.cls_tower = nn.Sequential(*cls_tower)
+        self.reg_tower = nn.Sequential(*reg_tower)
+
+        self.cls_pred = nn.Conv2d(fpn_channels, num_classes, 3, padding=1)
+        self.reg_pred = nn.Conv2d(fpn_channels, 4, 3, padding=1)
+        self.center_pred = nn.Conv2d(fpn_channels, 1, 3, padding=1)
+
+        self.scales = nn.Parameter(torch.ones(len(FPN_STRIDES)))
+
+        prior = 0.01
+        nn.init.constant_(self.cls_pred.bias, -math.log((1 - prior) / prior))
+        nn.init.zeros_(self.reg_pred.weight)
+        nn.init.zeros_(self.reg_pred.bias)
+        nn.init.zeros_(self.center_pred.weight)
+        nn.init.zeros_(self.center_pred.bias)
+
+    def forward(self, fpn_features: List[Tensor]) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]:
+        cls_logits, box_regs, centernesses = [], [], []
+        for level_idx, feat in enumerate(fpn_features):
+            cls_feat = self.cls_tower(feat)
+            reg_feat = self.reg_tower(feat)
+            cls_logits.append(self.cls_pred(cls_feat))
+            reg_raw = self.reg_pred(reg_feat) * self.scales[level_idx]
+            reg_raw = reg_raw.clamp(min=-10.0, max=10.0)
+            box_regs.append(torch.exp(reg_raw))
+            centernesses.append(self.center_pred(reg_feat))
+        return cls_logits, box_regs, centernesses
+
+
+class DetectionHead(nn.Module):
+    """Combined SFP + FCOS head."""
+
+    def __init__(self, in_channels: int = 768, fpn_channels: int = 256, num_classes: int = 80, num_convs: int = 4):
+        super().__init__()
+        self.fpn = SimpleFeaturePyramid(in_channels=in_channels, fpn_channels=fpn_channels)
+        self.head = FCOSHead(fpn_channels=fpn_channels, num_classes=num_classes, num_convs=num_convs)
+        self.num_classes = num_classes
+
+    def forward(self, spatial_features: Tensor) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]:
+        fpn = self.fpn(spatial_features)
+        return self.head(fpn)
+
+
+def _make_locations(feature_sizes: List[Tuple[int, int]], strides: List[int], device) -> List[Tensor]:
+    """Per-level center coordinates of feature-map locations in image space."""
+    all_locs = []
+    for (h, w), s in zip(feature_sizes, strides):
+        ys = (torch.arange(h, device=device, dtype=torch.float32) + 0.5) * s
+        xs = (torch.arange(w, device=device, dtype=torch.float32) + 0.5) * s
+        grid_y, grid_x = torch.meshgrid(ys, xs, indexing="ij")
+        locs = torch.stack([grid_x.flatten(), grid_y.flatten()], dim=-1)
+        all_locs.append(locs)
+    return all_locs
+
+
+@torch.inference_mode()
+def _decode_detections(
+    cls_logits_per_level: List[Tensor],
+    box_regs_per_level: List[Tensor],
+    centernesses_per_level: List[Tensor],
+    locations_per_level: List[Tensor],
+    image_sizes: List[Tuple[int, int]],
+    score_thresh: float = 0.05,
+    nms_thresh: float = 0.5,
+    max_per_level: int = 1000,
+    max_per_image: int = 100,
+) -> List[Dict[str, Tensor]]:
+    """Convert per-level logits/regs/centerness into per-image detections (xyxy boxes)."""
+    B = cls_logits_per_level[0].shape[0]
+    num_classes = cls_logits_per_level[0].shape[1]
+    device = cls_logits_per_level[0].device
+
+    per_image_results = []
+    for image_idx in range(B):
+        all_boxes, all_scores, all_labels = [], [], []
+        for cls_l, reg_l, ctr_l, locs_l in zip(
+            cls_logits_per_level, box_regs_per_level, centernesses_per_level, locations_per_level
+        ):
+            cls = cls_l[image_idx].permute(1, 2, 0).reshape(-1, num_classes)
+            reg = reg_l[image_idx].permute(1, 2, 0).reshape(-1, 4)
+            ctr = ctr_l[image_idx].permute(1, 2, 0).reshape(-1)
+
+            cls_prob = torch.sigmoid(cls)
+            ctr_prob = torch.sigmoid(ctr)
+            scores = cls_prob * ctr_prob[:, None]
+
+            mask = scores > score_thresh
+            if not mask.any():
+                continue
+            cand_loc, cand_cls = mask.nonzero(as_tuple=True)
+            cand_scores = scores[cand_loc, cand_cls]
+
+            if cand_scores.numel() > max_per_level:
+                top = cand_scores.topk(max_per_level)
+                cand_scores = top.values
+                idx = top.indices
+                cand_loc = cand_loc[idx]
+                cand_cls = cand_cls[idx]
+
+            cand_locs_xy = locs_l[cand_loc]
+            cand_reg = reg[cand_loc]
+            boxes = torch.stack([
+                cand_locs_xy[:, 0] - cand_reg[:, 0],
+                cand_locs_xy[:, 1] - cand_reg[:, 1],
+                cand_locs_xy[:, 0] + cand_reg[:, 2],
+                cand_locs_xy[:, 1] + cand_reg[:, 3],
+            ], dim=-1)
+            all_boxes.append(boxes)
+            all_scores.append(cand_scores)
+            all_labels.append(cand_cls)
+
+        if all_boxes:
+            boxes = torch.cat(all_boxes, dim=0)
+            scores = torch.cat(all_scores, dim=0)
+            labels = torch.cat(all_labels, dim=0)
+
+            H, W = image_sizes[image_idx]
+            boxes[:, 0::2] = boxes[:, 0::2].clamp(0, W)
+            boxes[:, 1::2] = boxes[:, 1::2].clamp(0, H)
+
+            keep_all = []
+            for c in labels.unique():
+                cm = labels == c
+                keep = nms(boxes[cm], scores[cm], nms_thresh)
+                keep_idx = cm.nonzero(as_tuple=True)[0][keep]
+                keep_all.append(keep_idx)
+            keep_all = torch.cat(keep_all, dim=0)
+
+            boxes = boxes[keep_all]
+            scores = scores[keep_all]
+            labels = labels[keep_all]
+
+            if scores.numel() > max_per_image:
+                top = scores.topk(max_per_image)
+                boxes = boxes[top.indices]
+                scores = top.values
+                labels = labels[top.indices]
+        else:
+            boxes = torch.zeros((0, 4), device=device)
+            scores = torch.zeros((0,), device=device)
+            labels = torch.zeros((0,), dtype=torch.long, device=device)
+
+        per_image_results.append({"boxes": boxes, "scores": scores, "labels": labels})
+
+    return per_image_results
+
+
+def _letterbox_to_square(image: Image.Image, resolution: int) -> Tuple[Image.Image, float, Tuple[int, int]]:
+    """Resize preserving aspect ratio and pad bottom/right with black. Matches the training transform."""
+    W0, H0 = image.size
+    scale = resolution / max(H0, W0)
+    new_w = int(round(W0 * scale))
+    new_h = int(round(H0 * scale))
+    resized = image.resize((new_w, new_h), Image.BILINEAR)
+    canvas = Image.new("RGB", (resolution, resolution), (0, 0, 0))
+    canvas.paste(resized, (0, 0))
+    return canvas, scale, (W0, H0)
+
+
 # ===========================================================================
 # Argus model (transformers-compatible)
 # ===========================================================================
@@ -893,6 +1142,10 @@ class ArgusConfig(PretrainedConfig):
         num_imagenet_classes: int = 1000,
         class_ids: Optional[list] = None,
         class_names: Optional[list] = None,
+        detection_num_classes: int = 80,
+        detection_fpn_channels: int = 256,
+        detection_num_convs: int = 4,
+        detection_class_names: Optional[list] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -905,6 +1158,10 @@ class ArgusConfig(PretrainedConfig):
         self.num_imagenet_classes = num_imagenet_classes
         self.class_ids = class_ids or []
         self.class_names = class_names or []
+        self.detection_num_classes = detection_num_classes
+        self.detection_fpn_channels = detection_fpn_channels
+        self.detection_num_convs = detection_num_convs
+        self.detection_class_names = detection_class_names or list(COCO_CLASSES)
 
 
 class Argus(PreTrainedModel):
@@ -939,16 +1196,33 @@ class Argus(PreTrainedModel):
             torch.zeros(config.num_imagenet_classes),
             persistent=True,
         )
+        self.detection_head = DetectionHead(
+            in_channels=config.embed_dim,
+            fpn_channels=config.detection_fpn_channels,
+            num_classes=config.detection_num_classes,
+            num_convs=config.detection_num_convs,
+        )
 
         for p in self.backbone.parameters():
             p.requires_grad = False
        self.backbone.eval()
         self.seg_head.eval()
         self.depth_head.eval()
+        self.detection_head.eval()
 
     def _init_weights(self, module):
-        # HF reallocates missing buffers with torch.empty() (uninitialized memory).
-        # Zero any buffer that came back NaN; leave loaded buffers untouched.
+        # HF reallocates missing buffers and parameters with torch.empty()
+        # (uninitialized memory) on from_pretrained. Populate sensible defaults
+        # for the standard layer types used by the detection head, and zero any
+        # Argus-level buffer that came back NaN.
+        if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.GroupNorm):
+            nn.init.ones_(module.weight)
+            nn.init.zeros_(module.bias)
+
         if module is self:
             for name in ("class_prototypes", "class_logit_weight", "class_logit_bias"):
                 if hasattr(self, name):
@@ -989,9 +1263,12 @@ class Argus(PreTrainedModel):
         cls = F.normalize(cls, dim=-1)
 
         if method == "knn":
-            scores_full = cls @ self.class_prototypes.T  # cosine similarity in [-1, 1]
+            proto = self.class_prototypes.to(cls.dtype)
+            scores_full = cls @ proto.T  # cosine similarity in [-1, 1]
         elif method == "softmax":
-            logits = F.linear(cls, self.class_logit_weight, self.class_logit_bias)
+            w = self.class_logit_weight.to(cls.dtype)
+            b = self.class_logit_bias.to(cls.dtype)
+            logits = F.linear(cls, w, b)
             scores_full = F.softmax(logits, dim=-1)  # in [0, 1]
         else:
             raise ValueError(f"unknown classification method: {method!r} (expected 'knn' or 'softmax')")
@@ -1111,6 +1388,76 @@ class Argus(PreTrainedModel):
             preds.append([px / resolution * tw, py / resolution * th])
         return preds
 
+    @torch.inference_mode()
+    def detect(
+        self,
+        image_or_images,
+        resolution: int = 640,
+        score_thresh: float = 0.05,
+        nms_thresh: float = 0.5,
+        max_per_image: int = 100,
+    ):
+        single, images = _normalize_image_input(image_or_images)
+
+        # Letterbox each image to match the training transform (resize long side
+        # to `resolution`, pad bottom/right with black). Box coordinates are
+        # recovered after decoding by unscaling.
+        canvases, scales, orig_sizes = [], [], []
+        for img in images:
+            canvas, scale, orig = _letterbox_to_square(img, resolution)
+            canvases.append(canvas)
+            scales.append(scale)
+            orig_sizes.append(orig)
+
+        det_normalize = v2.Compose([
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+            v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
+        ])
+        batch = torch.stack([det_normalize(c) for c in canvases]).to(self.device)
+
+        _, spatial = self._extract(batch)
+        with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
+            cls_logits, box_regs, centernesses = self.detection_head(spatial)
+        cls_logits = [c.float() for c in cls_logits]
+        box_regs = [b.float() for b in box_regs]
+        centernesses = [c.float() for c in centernesses]
+
+        feature_sizes = [(cl.shape[2], cl.shape[3]) for cl in cls_logits]
+        locations = _make_locations(feature_sizes, FPN_STRIDES, spatial.device)
+        image_sizes = [(resolution, resolution)] * len(images)
+
+        results = _decode_detections(
+            cls_logits, box_regs, centernesses, locations,
+            image_sizes=image_sizes,
+            score_thresh=score_thresh,
+            nms_thresh=nms_thresh,
+            max_per_image=max_per_image,
+        )
+
+        class_names = self.config.detection_class_names
+        formatted = []
+        for i, r in enumerate(results):
+            scale = scales[i]
+            orig_w, orig_h = orig_sizes[i]
+            boxes = r["boxes"].cpu().numpy() / scale
+            boxes[:, 0::2] = boxes[:, 0::2].clip(0, orig_w)
+            boxes[:, 1::2] = boxes[:, 1::2].clip(0, orig_h)
+
+            detections = []
+            for box, score, label in zip(
+                boxes, r["scores"].cpu().numpy(), r["labels"].cpu().numpy()
+            ):
+                detections.append({
+                    "box": [float(v) for v in box.tolist()],
+                    "score": float(score),
+                    "label": int(label),
+                    "class_name": class_names[int(label)] if int(label) < len(class_names) else f"class_{int(label)}",
+                })
+            formatted.append(detections)
+
+        return formatted[0] if single else formatted
+
     def perceive(self, image_or_images, return_confidence: bool = False):
         single, images = _normalize_image_input(image_or_images)
 