Upload model

Browse files

Files changed (4) hide show

config.json +1 -1
configuration.py +1 -1
model.safetensors +2 -2
modeling.py +72 -77

config.json CHANGED Viewed

@@ -42,5 +42,5 @@
   "transformers_version": "4.51.3",
   "use_pretrained_backbone": true,
   "use_timm_backbone": false,
-  "window_size": 16
 }

   "transformers_version": "4.51.3",
   "use_pretrained_backbone": true,
   "use_timm_backbone": false,
+  "window_size": 8
 }

configuration.py CHANGED Viewed

@@ -19,7 +19,7 @@ class LSPDetrConfig(PretrainedConfig):
         depths: tuple[int, ...] = (6, 2, 2),
         query_block_size: int = 16,
         num_heads: int = 12,
-        window_size: int = 16,
         tgt_window_sizes: tuple[int, ...] = (8, 8, 8),
         src_window_sizes: tuple[int, ...] = (8, 16, 32),
         num_radial_distances: int = 64,

         depths: tuple[int, ...] = (6, 2, 2),
         query_block_size: int = 16,
         num_heads: int = 12,
+        window_size: int = 8,
         tgt_window_sizes: tuple[int, ...] = (8, 8, 8),
         src_window_sizes: tuple[int, ...] = (8, 16, 32),
         num_radial_distances: int = 64,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6411cad5a0ebad05cbeb8324502f020a4a2a145fa4605dd09757cedb1018ad45
-size 205648888

 version https://git-lfs.github.com/spec/v1
+oid sha256:7f85a91f41a67c7c3f6fb50f71f41e95e1e47c72e561b98308d65d899160de43
+size 204465704

modeling.py CHANGED Viewed

@@ -21,7 +21,7 @@ class MLP(nn.Sequential):
         hidden_dim: int,
         output_dim: int,
         num_layers: int,
-        act_layer: type[nn.Module] = nn.ReLU,
         dropout: float = 0.0,
     ) -> None:
         assert num_layers > 1
@@ -65,6 +65,7 @@ class FeedForward(nn.Module):
 def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
     freqs_x = []
     freqs_y = []
     freqs = 1 / (theta ** (torch.arange(0, head_dim, 2 * pos_dim).float() / head_dim))
@@ -168,33 +169,15 @@ def maybe_pad(x: Tensor, window_size: int) -> Tensor:
 @torch.autocast("cuda", enabled=False)
-def relative_to_absolute_points(points: Tensor, height: int, width: int) -> Tensor:
-    points = points.sigmoid()
-    h, w = points.shape[1:3]
-    step_x = width / w
-    step_y = height / h
-    anchor_x = torch.arange(0, width, step_x, device=points.device)[:w]
-    anchor_y = torch.arange(0, height, step_y, device=points.device)[:h, None]
-    absolute_x = points[..., 0] * step_x + anchor_x
-    absolute_y = points[..., 1] * step_y + anchor_y
-    return torch.stack((absolute_x, absolute_y), dim=-1)
-@torch.autocast("cuda", enabled=False)
-def relative_to_absolute_points_normalized(points: Tensor) -> Tensor:
-    points = points.sigmoid()
-    h, w = points.shape[1:3]
-    anchor_x = torch.arange(0, 1, 1 / w, device=points.device)[:w]
-    anchor_y = torch.arange(0, 1, 1 / h, device=points.device)[:h, None]
-    absolute_x = points[..., 0] / w + anchor_x
-    absolute_y = points[..., 1] / h + anchor_y
     return torch.stack((absolute_x, absolute_y), dim=-1)
@@ -297,8 +280,8 @@ class WindowCrossAttention(nn.Module):
         # partition windows
         tgt = window_partition(tgt, self.tgt_window_size).flatten(1, 2)
-        tgt_coords = window_partition(tgt_coords, self.tgt_window_size).flatten(1, 2)
         src = window_partition(src, self.src_window_size).flatten(1, 2)
         src_coord = window_partition(src_coord, self.src_window_size).flatten(1, 2)
         attn_mask = self.get_attn_mask(
@@ -509,31 +492,13 @@ class Stage(nn.Module):
         return tgt
-class FeatureSampling(nn.Module):
-    def __init__(self, in_dim: int, out_dim: int) -> None:
-        super().__init__()
-        self.reduction = nn.Linear(in_dim, out_dim, bias=False)
-        self.norm = nn.LayerNorm(out_dim)
-    def forward(self, points: Tensor, feature: Tensor) -> Tensor:
-        x = F.grid_sample(feature, points * 2 - 1, align_corners=False)
-        return self.norm(self.reduction(rearrange(x, "b c h w -> b h w c")))
 class LSPTransformer(nn.Module):
-    def __init__(
-        self,
-        config: LSPDetrConfig,
-        bottleneck_channels: int,
-        feature_channels: list[int],
-    ) -> None:
         super().__init__()
         self.query_block_size = config.query_block_size
         self.num_radial_distances = config.num_radial_distances
-        self.feature_sampling = FeatureSampling(bottleneck_channels, config.dim)
         self.stages = nn.ModuleList()
         for i, depth in enumerate(config.depths):
             stage = Stage(
@@ -552,9 +517,9 @@ class LSPTransformer(nn.Module):
         # output heads
         self.class_head = nn.Linear(config.dim, config.num_classes + 1, bias=False)
-        self.point_head = MLP(config.dim, config.dim, 2, 3)
         self.radial_distances_head = MLP(
-            config.dim, config.dim, config.num_radial_distances, 3
         )
         self.init_weights()
@@ -565,29 +530,24 @@ class LSPTransformer(nn.Module):
         nn.init.constant_(self.point_head[-1].bias, 0.0)
     def forward(
-        self, bottleneck: Tensor, features: list[Tensor], height: int, width: int
     ) -> dict[str, Tensor | list[dict[str, Tensor]]]:
-        b = bottleneck.size(0)
         src = []
         src_coords = []
-        for i, feature in enumerate(reversed(features)):
-            h, w = feature.shape[2:4]
             coords = torch.zeros(b, h, w, 2, dtype=torch.float32, device=feature.device)
             src.append(self.input_norm[i](rearrange(feature, "b c h w -> b h w c")))
-            src_coords.append(relative_to_absolute_points(coords, height, width))
-        ref_points = torch.zeros(
-            b,
-            height // self.query_block_size,
-            width // self.query_block_size,
-            2,
-            dtype=torch.float32,
-            device=bottleneck.device,
-        )  # center positions
-        tgt = self.feature_sampling(
-            relative_to_absolute_points_normalized(ref_points), bottleneck
-        )
         logits_list: list[Tensor] = []
         ref_points_list: list[Tensor] = []
@@ -598,7 +558,9 @@ class LSPTransformer(nn.Module):
             tgt = stage(
                 tgt=tgt,
                 src=src[i],
-                tgt_coords=relative_to_absolute_points(ref_points, height, width),
                 src_coords=src_coords[i],
             )
@@ -608,8 +570,10 @@ class LSPTransformer(nn.Module):
             logits = self.class_head(tgt)
             ref_points_list.append(
-                relative_to_absolute_points_normalized(
-                    new_ref_points + delta_point
                 ).flatten(1, 2)
             )
             logits_list.append(logits.flatten(1, 2))
@@ -622,8 +586,8 @@ class LSPTransformer(nn.Module):
             "logits": logits_list[-1],
             "points": ref_points_list[-1],
             "radial_distances": radial_distances_list[-1],
-            "absolute_points": relative_to_absolute_points(
-                ref_points, height, width
             ).flatten(1, 2),
             "aux_outputs": [
                 {
@@ -641,17 +605,48 @@ class LSPTransformer(nn.Module):
         }
 class LSPDetrModel(PreTrainedModel):
     config_class = LSPDetrConfig
     def __init__(self, config: LSPDetrConfig) -> None:
         super().__init__(config)
         self.backbone = load_backbone(config)
-        _, *feature_channels, bottleneck = self.backbone.num_features
-        self.decode_head = LSPTransformer(config, bottleneck, feature_channels[::-1])
-    def forward(self, image: Tensor) -> dict[str, Tensor]:
-        *features, bottleneck = self.backbone(image).feature_maps
-        height, width = image.shape[2:]
-        return self.decode_head(bottleneck, features, height, width)

         hidden_dim: int,
         output_dim: int,
         num_layers: int,
+        act_layer: type[nn.Module] = nn.GELU,
         dropout: float = 0.0,
     ) -> None:
         assert num_layers > 1
 def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
+    """Taken from https://github.com/naver-ai/rope-vit/blob/main/self-attn/rope_self_attn.py."""
     freqs_x = []
     freqs_y = []
     freqs = 1 / (theta ** (torch.arange(0, head_dim, 2 * pos_dim).float() / head_dim))
 @torch.autocast("cuda", enabled=False)  # CUDA autocast is disabled inside this function
+def relative_to_absolute_pos(pos: Tensor, step_x: float, step_y: float) -> Tensor:  # per-cell offsets -> coordinates: (cell index + sigmoid(offset)) * step
+    pos = pos.sigmoid()  # squash each raw offset into (0, 1), i.e. a fraction of one cell
+    h, w = pos.shape[1:3]  # dims 1 and 2 of pos are treated as grid height and width
+    anchor_x = torch.arange(w, dtype=torch.float32, device=pos.device) * step_x  # left edge of every column: 0, step_x, 2 * step_x, ...
+    anchor_y = torch.arange(h, dtype=torch.float32, device=pos.device) * step_y  # top edge of every row: 0, step_y, 2 * step_y, ...
+    absolute_x = pos[..., 0] * step_x + anchor_x  # last-dim index 0 is used as the x offset
+    absolute_y = pos[..., 1] * step_y + anchor_y.unsqueeze(1)  # unsqueeze(1) makes the y anchors broadcast along rows instead of columns
     return torch.stack((absolute_x, absolute_y), dim=-1)  # (x, y) pairs stacked on a new last dim
         # partition windows
         tgt = window_partition(tgt, self.tgt_window_size).flatten(1, 2)
         src = window_partition(src, self.src_window_size).flatten(1, 2)
+        tgt_coords = window_partition(tgt_coords, self.tgt_window_size).flatten(1, 2)
         src_coord = window_partition(src_coord, self.src_window_size).flatten(1, 2)
         attn_mask = self.get_attn_mask(
         return tgt
 class LSPTransformer(nn.Module):
+    def __init__(self, config: LSPDetrConfig, feature_channels: list[int]) -> None:
         super().__init__()
         self.query_block_size = config.query_block_size
         self.num_radial_distances = config.num_radial_distances
         self.stages = nn.ModuleList()
         for i, depth in enumerate(config.depths):
             stage = Stage(
         # output heads
         self.class_head = nn.Linear(config.dim, config.num_classes + 1, bias=False)
+        self.point_head = MLP(config.dim, config.dim, 2, 2)
         self.radial_distances_head = MLP(
+            config.dim, config.dim, config.num_radial_distances, 2
         )
         self.init_weights()
         nn.init.constant_(self.point_head[-1].bias, 0.0)
     def forward(
+        self,
+        tgt: Tensor,
+        ref_points: Tensor,
+        features: list[Tensor],
+        height: int,
+        width: int,
     ) -> dict[str, Tensor | list[dict[str, Tensor]]]:
         src = []
         src_coords = []
+        for i, feature in enumerate(features):
+            b, _, h, w = feature.shape
             coords = torch.zeros(b, h, w, 2, dtype=torch.float32, device=feature.device)
             src.append(self.input_norm[i](rearrange(feature, "b c h w -> b h w c")))
+            src_coords.append(
+                relative_to_absolute_pos(
+                    coords, step_x=math.ceil(width / w), step_y=math.ceil(height / h)
+                )
+            )
         logits_list: list[Tensor] = []
         ref_points_list: list[Tensor] = []
             tgt = stage(
                 tgt=tgt,
                 src=src[i],
+                tgt_coords=relative_to_absolute_pos(
+                    ref_points, self.query_block_size, self.query_block_size
+                ),
                 src_coords=src_coords[i],
             )
             logits = self.class_head(tgt)
             ref_points_list.append(
+                relative_to_absolute_pos(
+                    new_ref_points + delta_point,
+                    step_x=self.query_block_size / width,
+                    step_y=self.query_block_size / height,
                 ).flatten(1, 2)
             )
             logits_list.append(logits.flatten(1, 2))
             "logits": logits_list[-1],
             "points": ref_points_list[-1],
             "radial_distances": radial_distances_list[-1],
+            "absolute_points": relative_to_absolute_pos(
+                ref_points, self.query_block_size, self.query_block_size
             ).flatten(1, 2),
             "aux_outputs": [
                 {
         }
+class FeatureSampling(nn.Module):  # samples a feature map at given points, then projects and layer-normalizes the samples
+    def __init__(self, in_dim: int, out_dim: int) -> None:
+        super().__init__()
+        self.reduction = nn.Linear(in_dim, out_dim, bias=False)  # bias-free projection from in_dim to out_dim
+        self.norm = nn.LayerNorm(out_dim)
+    def forward(self, points: Tensor, feature: Tensor) -> Tensor:
+        x = F.grid_sample(feature, points * 2 - 1, align_corners=False)  # points * 2 - 1 maps [0, 1] onto grid_sample's [-1, 1] range; presumably points are normalized — confirm against callers
+        return self.norm(self.reduction(rearrange(x, "b c h w -> b h w c")))  # move channels last, project, then normalize
 class LSPDetrModel(PreTrainedModel):  # backbone + neck feature sampling + LSPTransformer decode head
     config_class = LSPDetrConfig
     def __init__(self, config: LSPDetrConfig) -> None:
         super().__init__(config)
+        self.query_block_size = config.query_block_size
         self.backbone = load_backbone(config)
+        _, *feature_channels, neck = self.backbone.num_features  # first entry is discarded, last is the neck width, the middle ones go to the decode head
+        self.feature_sampling = FeatureSampling(neck, config.dim)
+        self.decode_head = LSPTransformer(config, feature_channels[::-1])  # channel list is handed over in reversed order
+    def forward(self, pixel_values: Tensor) -> dict[str, Tensor]:
+        b, _, h, w = pixel_values.shape
+        *features, neck = self.backbone(pixel_values).feature_maps  # last feature map is the neck; the rest are decoder inputs
+        ref_points = torch.zeros(
+            b,
+            math.ceil(h / self.query_block_size),  # ceil: a partial trailing block still gets its own query row
+            math.ceil(w / self.query_block_size),  # ceil: a partial trailing block still gets its own query column
+            2,
+            dtype=torch.float32,
+            device=neck.device,
+        )  # center positions: zero offsets become 0.5 after the sigmoid in relative_to_absolute_pos
+        tgt = self.feature_sampling(
+            relative_to_absolute_pos(  # NOTE(review): steps of query_block_size yield block-scaled coordinates, yet FeatureSampling applies points * 2 - 1 as if points were in [0, 1] — confirm the intended scale
+                ref_points, self.query_block_size, self.query_block_size
+            ),
+            neck,
+        )
+        return self.decode_head(tgt, ref_points, features[::-1], h, w)  # features reversed, matching feature_channels[::-1] in __init__