Upload model

- config.json +1 -7
- configuration.py +20 -4
- model.safetensors +2 -2
- modeling.py +109 -125
config.json
CHANGED

@@ -14,14 +14,8 @@
   ],
   "dim": 384,
   "dropout": 0.1,
-  "in_channels": [
-    768,
-    384,
-    192,
-    96
-  ],
   "model_type": "lsp_detr",
-  "num_classes":
+  "num_classes": 1,
   "num_heads": 12,
   "num_radial_distances": 64,
   "query_block_size": 16,
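With `in_channels` gone from the config, feature widths are no longer hard-coded; the model now reads them off the backbone at build time (see modeling.py below), and `num_classes` is pinned to 1. A hedged loading sketch — the repo id "user/lsp-detr" is a placeholder for wherever this commit lives on the Hub:

```python
# Sketch: load the updated config via custom code on the Hub.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("user/lsp-detr", trust_remote_code=True)
assert config.model_type == "lsp_detr"
assert config.num_classes == 1
```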
configuration.py
CHANGED

@@ -1,4 +1,7 @@
+from typing import Any
+
 from transformers import PretrainedConfig
+from transformers.utils.backbone_utils import verify_backbone_config_arguments
 
 
 class LSPDetrConfig(PretrainedConfig):

@@ -6,11 +9,14 @@ class LSPDetrConfig(PretrainedConfig):
 
     def __init__(
         self,
-
+        use_timm_backbone: bool = False,
+        use_pretrained_backbone: bool = True,
+        backbone: str = "microsoft/swinv2-tiny-patch4-window16-256",
+        backbone_kwargs: dict[str, Any] | None = None,
+        backbone_config: Any | None = None,
         dim: int = 384,
-        num_classes: int =
+        num_classes: int = 1,
         depths: tuple[int, ...] = (6, 2, 2),
-        in_channels: tuple[int, ...] = (768, 384, 192, 96),
         query_block_size: int = 16,
         num_heads: int = 12,
         window_size: int = 16,

@@ -20,11 +26,21 @@ class LSPDetrConfig(PretrainedConfig):
         dropout: float = 0.1,
         **kwargs,
     ) -> None:
+        if backbone_kwargs is None:
+            backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}
+
+        verify_backbone_config_arguments(
+            use_timm_backbone=use_timm_backbone,
+            use_pretrained_backbone=use_pretrained_backbone,
+            backbone=backbone,
+            backbone_config=backbone_config,
+            backbone_kwargs=backbone_kwargs,
+        )
+
        self.backbone = backbone
        self.dim = dim
        self.num_classes = num_classes
        self.depths = depths
-       self.in_channels = in_channels
        self.query_block_size = query_block_size
        self.num_heads = num_heads
        self.window_size = window_size
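The constructor now validates the backbone arguments up front and defaults `backbone_kwargs` to exporting all four SwinV2 stages. A minimal sketch of the resulting defaults (attribute names exactly as assigned above):

```python
from configuration import LSPDetrConfig  # this repo's module

config = LSPDetrConfig()
print(config.backbone)     # microsoft/swinv2-tiny-patch4-window16-256
print(config.num_classes)  # 1
print(config.depths)       # (6, 2, 2)
```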
model.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6411cad5a0ebad05cbeb8324502f020a4a2a145fa4605dd09757cedb1018ad45
+size 205648888
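Only the Git LFS pointer is versioned here; the weights themselves (about 206 MB) are content-addressed by the sha256 above. A quick integrity check against a downloaded copy:

```python
# Sketch: recompute the digest recorded in the LFS pointer.
import hashlib

def sha256_of(path: str) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "6411cad5a0ebad05cbeb8324502f020a4a2a145fa4605dd09757cedb1018ad45"
assert sha256_of("model.safetensors") == expected
```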
modeling.py
CHANGED

@@ -5,12 +5,65 @@ import torch.nn.functional as F
 from einops import rearrange, repeat
 from torch import Tensor, nn
 from torch.nn.utils import parametrize
-from transformers import PreTrainedModel
+from transformers import PreTrainedModel
 from transformers.models.swinv2.modeling_swinv2 import window_partition, window_reverse
+from transformers.utils.backbone_utils import load_backbone
 
 from .configuration import LSPDetrConfig
 
 
+class MLP(nn.Sequential):
+    """Very simple multi-layer perceptron."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act_layer: type[nn.Module] = nn.ReLU,
+        dropout: float = 0.0,
+    ) -> None:
+        assert num_layers > 1
+
+        layers = []
+        h = [hidden_dim] * (num_layers - 1)
+        for n, k in zip([input_dim, *h], h, strict=False):
+            layers.append(nn.Linear(n, k))
+            layers.append(act_layer())
+            if dropout > 0:
+                layers.append(nn.Dropout(dropout))
+
+        layers.append(nn.Linear(hidden_dim, output_dim))
+        super().__init__(*layers)
+
+
+class FeedForward(nn.Module):
+    """FeedForward module.
+
+    Taken from https://github.com/meta-llama/llama-models/blob/main/models/llama4/ffn.py
+    """
+
+    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 256) -> None:
+        """Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+        """
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
 def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
     freqs_x = []
     freqs_y = []
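`MLP` and `FeedForward` move above `init_freqs` unchanged (the matching deletion is in the next hunk). `FeedForward` uses the Llama-style SwiGLU sizing: shrink the requested hidden width to 2/3, then round up to a multiple of `multiple_of`. A worked example of that arithmetic, assuming the conventional 4x expansion (the actual call sites are outside this diff):

```python
dim, multiple_of = 384, 256
hidden_dim = 4 * dim                  # 1536, assumed expansion factor
hidden_dim = int(2 * hidden_dim / 3)  # 1024
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)  # 1024
# w1, w3: 384 -> 1024; w2: 1024 -> 384, all bias-free.
```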
@@ -107,56 +160,11 @@ class CayleySTRING(nn.Module):
         return out.type_as(x)
 
 
-class MLP(nn.Sequential):
-    """Very simple multi-layer perceptron."""
-
-    def __init__(
-        self,
-        input_dim: int,
-        hidden_dim: int,
-        output_dim: int,
-        num_layers: int,
-        act_layer: type[nn.Module] = nn.ReLU,
-        dropout: float = 0.0,
-    ) -> None:
-        assert num_layers > 1
-
-        layers = []
-        h = [hidden_dim] * (num_layers - 1)
-        for n, k in zip([input_dim, *h], h, strict=False):
-            layers.append(nn.Linear(n, k))
-            layers.append(act_layer())
-            if dropout > 0:
-                layers.append(nn.Dropout(dropout))
-
-        layers.append(nn.Linear(hidden_dim, output_dim))
-        super().__init__(*layers)
-
-
-class FeedForward(nn.Module):
-    """FeedForward module.
-
-    Taken from https://github.com/meta-llama/llama-models/blob/main/models/llama4/ffn.py
-    """
-
-    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 256) -> None:
-        """Initialize the FeedForward module.
-
-        Args:
-            dim (int): Input dimension.
-            hidden_dim (int): Hidden dimension of the feedforward layer.
-            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
-        """
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
-        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
-        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
-        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
-
-    def forward(self, x: Tensor) -> Tensor:
-        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+def maybe_pad(x: Tensor, window_size: int) -> Tensor:
+    h, w = x.shape[1:3]
+    pad_right = (window_size - w % window_size) % window_size
+    pad_bottom = (window_size - h % window_size) % window_size
+    return F.pad(x, (0, 0, 0, pad_right, 0, pad_bottom))
 
 
 @torch.autocast("cuda", enabled=False)
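`maybe_pad` works on channels-last (B, H, W, C) tensors: since `F.pad` pads from the last dimension backwards, `(0, 0, 0, pad_right, 0, pad_bottom)` leaves C alone, pads W, then H. For example:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 37, 50, 96)    # (B, H, W, C), window_size = 16
pad_right = (16 - 50 % 16) % 16   # 14
pad_bottom = (16 - 37 % 16) % 16  # 11
print(F.pad(x, (0, 0, 0, pad_right, 0, pad_bottom)).shape)  # [2, 48, 64, 96]
```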
@@ -261,6 +269,13 @@ class WindowCrossAttention(nn.Module):
         self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coord: Tensor
     ) -> Tensor:
         b, h, w, c = tgt.shape
+
+        # pad to multiples of window size
+        tgt = maybe_pad(tgt, self.tgt_window_size)
+        src = maybe_pad(src, self.src_window_size)
+        tgt_coords = maybe_pad(tgt_coords, self.tgt_window_size)
+        src_coord = maybe_pad(src_coord, self.src_window_size)
+        h_pad, w_pad = tgt.shape[1:3]
         src_h, src_w = src.shape[1:3]
 
         # cyclic shift

@@ -286,7 +301,9 @@ class WindowCrossAttention(nn.Module):
         src = window_partition(src, self.src_window_size).flatten(1, 2)
         src_coord = window_partition(src_coord, self.src_window_size).flatten(1, 2)
 
-        attn_mask = self.get_attn_mask(h, w, src_h, src_w, tgt.device, tgt.dtype)
+        attn_mask = self.get_attn_mask(
+            h_pad, w_pad, src_h, src_w, tgt.device, tgt.dtype
+        )
 
         if attn_mask is not None:
             attn_mask = repeat(attn_mask, "n l s -> (b n) h l s", b=b, h=self.num_heads)

@@ -307,7 +324,7 @@ class WindowCrossAttention(nn.Module):
 
         # merge windows
         tgt = tgt.view(-1, self.tgt_window_size, self.tgt_window_size, c)
-        tgt = window_reverse(tgt, self.tgt_window_size, h, w)
+        tgt = window_reverse(tgt, self.tgt_window_size, h_pad, w_pad)
 
         # reverse cyclic shift
         if self.tgt_shift_size > 0:

@@ -315,7 +332,7 @@ class WindowCrossAttention(nn.Module):
                 tgt, shifts=(self.tgt_shift_size, self.tgt_shift_size), dims=(1, 2)
             )
 
-        return tgt
+        return tgt[:, :h, :w, :].contiguous()  # remove padding
 
 
 class WindowSelfAttention(nn.Module):

@@ -360,6 +377,11 @@ class WindowSelfAttention(nn.Module):
         """
         b, h, w, c = x.shape
 
+        # pad to multiples of window size
+        x = maybe_pad(x, self.window_size)
+        coords = maybe_pad(coords, self.window_size)
+        h_pad, w_pad = x.shape[1:3]
+
         # cyclic shift
         if self.shift_size > 0:
             x = x.roll(shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))

@@ -371,7 +393,7 @@ class WindowSelfAttention(nn.Module):
         x = window_partition(x, self.window_size).flatten(1, 2)
         coords = window_partition(coords, self.window_size).flatten(1, 2)
 
-        attn_mask = self.get_attn_mask(h, w, x.device, x.dtype)
+        attn_mask = self.get_attn_mask(h_pad, w_pad, x.device, x.dtype)
         if attn_mask is not None:
             attn_mask = repeat(attn_mask, "n l s -> (b n) h l s", b=b, h=self.num_heads)
 

@@ -390,13 +412,13 @@ class WindowSelfAttention(nn.Module):
 
         # merge windows
         x = x.view(-1, self.window_size, self.window_size, c)
-        x = window_reverse(x, self.window_size, h, w)
+        x = window_reverse(x, self.window_size, h_pad, w_pad)
 
         # reverse cyclic shift
         if self.shift_size > 0:
             x = torch.roll(x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
 
-        return x
+        return x[:, :h, :w, :].contiguous()  # remove padding
 
 
 class Block(nn.Module):
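Both attention modules now share one shape discipline: pad to a window multiple, optionally shift, partition into windows, mask using the padded extents, reverse, unshift, and finally crop back to the caller's (h, w). A shape-only round-trip sketch with the SwinV2 helpers imported above:

```python
import torch
from transformers.models.swinv2.modeling_swinv2 import window_partition, window_reverse

x = torch.randn(2, 48, 64, 96)     # already padded: 48 and 64 divide by 16
windows = window_partition(x, 16)  # (2 * 3 * 4, 16, 16, 96)
assert torch.equal(window_reverse(windows, 16, 48, 64), x)
```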
@@ -501,47 +523,39 @@ class FeatureSampling(nn.Module):
 class LSPTransformer(nn.Module):
     def __init__(
         self,
-        dim: int,
-        num_classes: int,
-        query_block_size: int,
-        in_channels: list[int],
-        depths: list[int],
-        num_heads: int,
-        window_size: int,
-        tgt_window_sizes: list[int],
-        src_window_sizes: list[int],
-        num_radial_distances: int,
-        dropout: float = 0.0,
+        config: LSPDetrConfig,
+        bottleneck_channels: int,
+        feature_channels: list[int],
     ) -> None:
         super().__init__()
 
-        self.
-        self.
-        self.num_radial_distances = num_radial_distances
+        self.query_block_size = config.query_block_size
+        self.num_radial_distances = config.num_radial_distances
 
-
-        self.feature_sampling = FeatureSampling(bottleneck, dim)
+        self.feature_sampling = FeatureSampling(bottleneck_channels, config.dim)
 
         self.stages = nn.ModuleList()
-        for i, depth in enumerate(depths):
+        for i, depth in enumerate(config.depths):
             stage = Stage(
-                dim=dim,
-                src_dim=in_channels[i],
+                dim=config.dim,
+                src_dim=feature_channels[i],
                 depth=depth,
-                num_heads=num_heads,
-                window_size=window_size,
-                tgt_window_size=tgt_window_sizes[i],
-                src_window_size=src_window_sizes[i],
-                dropout=dropout,
+                num_heads=config.num_heads,
+                window_size=config.window_size,
+                tgt_window_size=config.tgt_window_sizes[i],
+                src_window_size=config.src_window_sizes[i],
+                dropout=config.dropout,
             )
             self.stages.append(stage)
 
-        self.input_norm = nn.ModuleList(nn.LayerNorm(d) for d in in_channels)
+        self.input_norm = nn.ModuleList(nn.LayerNorm(d) for d in feature_channels)
 
         # output heads
-        self.class_head = nn.Linear(dim, num_classes + 1, bias=False)
-        self.point_head = MLP(dim, dim, 2, 3)
-        self.radial_distances_head = MLP(dim, dim, num_radial_distances, 3)
+        self.class_head = nn.Linear(config.dim, config.num_classes + 1, bias=False)
+        self.point_head = MLP(config.dim, config.dim, 2, 3)
+        self.radial_distances_head = MLP(
+            config.dim, config.dim, config.num_radial_distances, 3
+        )
 
         self.init_weights()
 

@@ -551,15 +565,13 @@ class LSPTransformer(nn.Module):
         nn.init.constant_(self.point_head[-1].bias, 0.0)
 
     def forward(
-        self, multi_scale_features: list[Tensor], height: int, width: int
+        self, bottleneck: Tensor, features: list[Tensor], height: int, width: int
     ) -> dict[str, Tensor | list[dict[str, Tensor]]]:
-        *multi_scale_features, bottleneck = multi_scale_features
-
         b = bottleneck.size(0)
 
         src = []
         src_coords = []
-        for i, feature in enumerate(reversed(multi_scale_features)):
+        for i, feature in enumerate(reversed(features)):
             h, w = feature.shape[2:4]
             coords = torch.zeros(b, h, w, 2, dtype=torch.float32, device=feature.device)
             src.append(self.input_norm[i](rearrange(feature, "b c h w -> b h w c")))

@@ -610,10 +622,9 @@ class LSPTransformer(nn.Module):
             "logits": logits_list[-1],
             "points": ref_points_list[-1],
             "radial_distances": radial_distances_list[-1],
-            "
-
-
-            ),
+            "absolute_points": relative_to_absolute_points(
+                ref_points, height, width
+            ).flatten(1, 2),
             "aux_outputs": [
                 {
                     "logits": a,

@@ -629,19 +640,6 @@ class LSPTransformer(nn.Module):
             ],
         }
 
-    @torch.no_grad()
-    @torch.autocast("cuda", enabled=False)
-    def get_polygons(self, ref_points: Tensor, radial_distances: Tensor) -> Tensor:
-        t = torch.linspace(
-            0, 1, self.num_radial_distances + 1, device=ref_points.device
-        )[:-1]
-        cos = torch.cos(2 * torch.pi * t)
-        sin = torch.sin(2 * torch.pi * t)
-
-        radial_distances = radial_distances.expm1()
-        polar = radial_distances.unsqueeze(-1) * torch.stack([sin, cos], dim=-1)
-        return ref_points.unsqueeze(-2) + polar
-
 
 class LSPDetrModel(PreTrainedModel):
     config_class = LSPDetrConfig
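`LSPTransformer` is now constructed from the config plus channel widths discovered at runtime, instead of eleven positional arguments. A hypothetical wiring sketch (the widths are placeholders; in the real model they come from `backbone.num_features`, and `feature_channels` needs one entry per stage in `config.depths`):

```python
config = LSPDetrConfig()
head = LSPTransformer(
    config,
    bottleneck_channels=768,          # placeholder width
    feature_channels=[384, 192, 96],  # placeholder widths, one per stage
)
```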
@@ -649,25 +647,11 @@ class LSPDetrModel(PreTrainedModel):
     def __init__(self, config: LSPDetrConfig) -> None:
         super().__init__(config)
 
-        self.backbone =
-
-        )
-
-        self.decode_head = LSPTransformer(
-            dim=config.dim,
-            num_classes=config.num_classes,
-            query_block_size=config.query_block_size,
-            in_channels=config.in_channels,
-            depths=config.depths,
-            num_heads=config.num_heads,
-            window_size=config.window_size,
-            tgt_window_sizes=config.tgt_window_sizes,
-            src_window_sizes=config.src_window_sizes,
-            num_radial_distances=config.num_radial_distances,
-            dropout=config.dropout,
-        )
+        self.backbone = load_backbone(config)
+        _, *feature_channels, bottleneck = self.backbone.num_features
+        self.decode_head = LSPTransformer(config, bottleneck, feature_channels[::-1])
 
     def forward(self, image: Tensor) -> dict[str, Tensor]:
-        features = self.backbone(image).feature_maps
+        *features, bottleneck = self.backbone(image).feature_maps
         height, width = image.shape[2:]
-        return self.decode_head(features, height, width)
+        return self.decode_head(bottleneck, features, height, width)
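With `load_backbone`, the model owns its feature extractor end to end: the last backbone feature map becomes the bottleneck and the remaining maps feed the decode head. A hedged usage sketch (the input size is a placeholder matching the default SwinV2 checkpoint name):

```python
import torch

model = LSPDetrModel(LSPDetrConfig()).eval()
image = torch.randn(1, 3, 256, 256)  # placeholder batch
with torch.no_grad():
    outputs = model(image)
print(outputs["logits"].shape, outputs["points"].shape)
```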