matejpekar committed
Commit c3f67c4 · verified · 1 Parent(s): 1a37868

Upload LSPDETR

Files changed (4)
  1. config.json +41 -0
  2. configuration.py +35 -0
  3. model.safetensors +3 -0
  4. modeling.py +671 -0
config.json ADDED
@@ -0,0 +1,41 @@
+{
+  "architectures": [
+    "LSPDETR"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration.LSPDETRConfig",
+    "AutoModelForObjectDetection": "modeling.LSPDETR"
+  },
+  "backbone": "microsoft/swinv2-tiny-patch4-window16-256",
+  "depths": [
+    6,
+    2,
+    2
+  ],
+  "dim": 384,
+  "dropout": 0.1,
+  "in_channels": [
+    768,
+    384,
+    192,
+    96
+  ],
+  "model_type": "LSP-DETR",
+  "num_classes": 2,
+  "num_heads": 12,
+  "num_radial_distances": 64,
+  "query_block_size": 16,
+  "src_window_sizes": [
+    8,
+    16,
+    32
+  ],
+  "tgt_window_sizes": [
+    8,
+    8,
+    8
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "window_size": 16
+}
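The auto_map block above is what lets transformers resolve the custom classes shipped in this repo at load time. A minimal loading sketch; the repo id below is a placeholder, not the actual repository name:

from transformers import AutoConfig, AutoModelForObjectDetection

repo_id = "<namespace>/LSP-DETR"  # placeholder; substitute the real Hub repo id

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForObjectDetection.from_pretrained(repo_id, trust_remote_code=True)
assert config.model_type == "LSP-DETR"

trust_remote_code=True is required because configuration.py and modeling.py ship with the checkpoint rather than with the transformers library itself.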
configuration.py ADDED
@@ -0,0 +1,35 @@
+from transformers import PretrainedConfig
+
+
+class LSPDETRConfig(PretrainedConfig):
+    model_type = "LSP-DETR"
+
+    def __init__(
+        self,
+        backbone: str = "microsoft/swinv2-tiny-patch4-window16-256",
+        dim: int = 384,
+        num_classes: int = 2,
+        depths: tuple[int, ...] = (6, 2, 2),
+        in_channels: tuple[int, ...] = (768, 384, 192, 96),
+        query_block_size: int = 16,
+        num_heads: int = 12,
+        window_size: int = 16,
+        tgt_window_sizes: tuple[int, ...] = (8, 8, 8),
+        src_window_sizes: tuple[int, ...] = (8, 16, 32),
+        num_radial_distances: int = 64,
+        dropout: float = 0.1,
+        **kwargs,
+    ) -> None:
+        self.backbone = backbone
+        self.dim = dim
+        self.num_classes = num_classes
+        self.depths = depths
+        self.in_channels = in_channels
+        self.query_block_size = query_block_size
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.tgt_window_sizes = tgt_window_sizes
+        self.src_window_sizes = src_window_sizes
+        self.num_radial_distances = num_radial_distances
+        self.dropout = dropout
+        super().__init__(**kwargs)
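Since every argument has a default mirroring config.json, the config can also be built standalone; a quick sketch of the round trip, assuming it is run from a local clone of the repo:

from configuration import LSPDETRConfig

config = LSPDETRConfig()  # defaults reproduce the uploaded config.json
assert config.to_dict()["model_type"] == "LSP-DETR"
assert len(config.depths) == len(config.tgt_window_sizes) == 3  # one entry per decoder stage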
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e725b30741b7f18033487d32688d1cc223f445318bd0a600f1dca082cd9e9352
+size 205650424
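This file is a Git LFS pointer, not the weights themselves: the oid is the SHA-256 of the roughly 196 MiB safetensors blob. A sketch that re-verifies a downloaded copy against the pointer (repo id again a placeholder):

import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("<namespace>/LSP-DETR", filename="model.safetensors")

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert digest.hexdigest() == "e725b30741b7f18033487d32688d1cc223f445318bd0a600f1dca082cd9e9352"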
modeling.py ADDED
@@ -0,0 +1,671 @@
+import math
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import Tensor, nn
+from torch.nn.utils import parametrize
+from transformers import PreTrainedModel, Swinv2Backbone
+from transformers.models.swinv2.modeling_swinv2 import window_partition, window_reverse
+
+from .configuration import LSPDETRConfig
+
+
+def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
+    """Build 2D RoPE frequencies with a random in-plane rotation per head (RoPE-Mixed)."""
+    freqs_x = []
+    freqs_y = []
+    freqs = 1 / (theta ** (torch.arange(0, head_dim, 2 * pos_dim).float() / head_dim))
+    for _ in range(num_heads):
+        angles = torch.rand(1) * 2 * torch.pi
+        fx = torch.cat(
+            [freqs * torch.cos(angles), freqs * torch.cos(torch.pi / 2 + angles)],
+            dim=-1,
+        )
+        fy = torch.cat(
+            [freqs * torch.sin(angles), freqs * torch.sin(torch.pi / 2 + angles)],
+            dim=-1,
+        )
+        freqs_x.append(fx)
+        freqs_y.append(fy)
+    freqs_x = torch.stack(freqs_x, dim=0)
+    freqs_y = torch.stack(freqs_y, dim=0)
+    return torch.stack([freqs_x, freqs_y], dim=0)
+
+
+class Skew(nn.Module):
+    """Skew-symmetric matrix parameterization."""
+
+    def forward(self, x: Tensor) -> Tensor:
+        a = x.triu(1)
+        return a - a.transpose(-1, -2)
+
+    def right_inverse(self, x: Tensor) -> Tensor:
+        return x.triu(1)
+
+
+class CayleySTRING(nn.Module):
+    """Implements the Cayley-STRING positional encoding.
+
+    Based on "Learning the RoPEs: Better 2D and 3D Position Encodings with STRING"
+    (https://arxiv.org/abs/2502.02562).
+
+    Applies RoPE followed by multiplication with a learnable orthogonal matrix P
+    parameterized by the Cayley transform: P = (I - S)(I + S)^-1, where S is
+    a learnable skew-symmetric matrix.
+
+    Args:
+        dim (int): The feature dimension of the input tensor. Must be divisible by
+            num_heads, with an even per-head dimension.
+        num_heads (int): The number of attention heads.
+        pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D). Defaults to 2.
+        theta (float): The base value for the RoPE frequency calculation. Defaults to 100.0.
+    """
+
+    def __init__(
+        self, dim: int, num_heads: int, pos_dim: int = 2, theta: float = 100.0
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "Dimension must be divisible by num_heads."
+
+        head_dim = dim // num_heads
+
+        self.freqs = nn.Parameter(init_freqs(head_dim, num_heads, pos_dim, theta))
+
+        self.S = nn.Parameter(torch.zeros(head_dim, head_dim))
+        parametrize.register_parametrization(self, "S", Skew())
+
+        self.register_buffer("I", torch.eye(head_dim), persistent=False)
+
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        self.S = nn.init.kaiming_uniform_(self.S, a=math.sqrt(5))
+
+    @parametrize.cached()
+    @torch.autocast("cuda", enabled=False)
+    def forward(self, x: Tensor, positions: Tensor) -> Tensor:
+        """Apply Cayley-STRING positional encoding.
+
+        Args:
+            x ([b, h, n, d]): Input tensor.
+            positions ([b, n, pos_dim]): Positions tensor.
+        """
+        # Compute (I + S)^-1 @ x
+        y = torch.linalg.solve(
+            self.I + self.S, rearrange(x.float(), "b h n d -> h d (b n)")
+        )
+
+        # change of basis: P @ x with P = (I - S)(I + S)^-1
+        px = torch.matmul(self.I - self.S, y)
+        px = rearrange(px, "h d (b n) -> b h n d", b=x.size(0)).contiguous()
+
+        # apply RoPE-Mixed
+        angles = torch.einsum("bnk,khc->bhnc", positions, self.freqs)
+        freqs_cis = torch.polar(torch.ones_like(angles), angles)
+        px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
+        out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
+
+        return out.type_as(x)
+
+
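An editorial aside on the docstring above, not part of the uploaded file: for any skew-symmetric S, the Cayley transform P = (I - S)(I + S)^-1 is orthogonal, so the learned change of basis preserves dot products exactly as plain RoPE does. A standalone numerical check:

import torch

head_dim = 32
raw = torch.randn(head_dim, head_dim)
s = raw.triu(1) - raw.triu(1).T  # skew-symmetric, as in the Skew parameterization
eye = torch.eye(head_dim)
p = (eye - s) @ torch.linalg.inv(eye + s)  # Cayley transform
assert torch.allclose(p @ p.T, eye, atol=1e-4)  # P is orthogonal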
+class MLP(nn.Sequential):
+    """Very simple multi-layer perceptron."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act_layer: type[nn.Module] = nn.ReLU,
+        dropout: float = 0.0,
+    ) -> None:
+        assert num_layers > 1
+
+        layers = []
+        h = [hidden_dim] * (num_layers - 1)
+        for n, k in zip([input_dim, *h], h, strict=False):
+            layers.append(nn.Linear(n, k))
+            layers.append(act_layer())
+            if dropout > 0:
+                layers.append(nn.Dropout(dropout))
+
+        layers.append(nn.Linear(hidden_dim, output_dim))
+        super().__init__(*layers)
+
+
+class FeedForward(nn.Module):
+    """FeedForward module.
+
+    Taken from https://github.com/meta-llama/llama-models/blob/main/models/llama4/ffn.py
+    """
+
+    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 256) -> None:
+        """Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Round the hidden dimension up to a multiple of this value.
+        """
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+@torch.autocast("cuda", enabled=False)
+def relative_to_absolute_points(points: Tensor, height: int, width: int) -> Tensor:
+    points = points.sigmoid()
+    h, w = points.shape[1:3]
+
+    step_x = width / w
+    step_y = height / h
+
+    anchor_x = torch.arange(0, width, step_x, device=points.device)[:w]
+    anchor_y = torch.arange(0, height, step_y, device=points.device)[:h, None]
+
+    absolute_x = points[..., 0] * step_x + anchor_x
+    absolute_y = points[..., 1] * step_y + anchor_y
+
+    return torch.stack((absolute_x, absolute_y), dim=-1)
+
+
+@torch.autocast("cuda", enabled=False)
+def relative_to_absolute_points_normalized(points: Tensor) -> Tensor:
+    points = points.sigmoid()
+    h, w = points.shape[1:3]
+
+    anchor_x = torch.arange(0, 1, 1 / w, device=points.device)[:w]
+    anchor_y = torch.arange(0, 1, 1 / h, device=points.device)[:h, None]
+
+    absolute_x = points[..., 0] / w + anchor_x
+    absolute_y = points[..., 1] / h + anchor_y
+
+    return torch.stack((absolute_x, absolute_y), dim=-1)
+
+
+def get_mask_windows(
+    height: int, width: int, window_size: int, shift_size: int, device: torch.device
+) -> Tensor:
+    # Create indices for height and width regions
+    h_idx = torch.zeros(height, dtype=torch.long, device=device)
+    h_idx[height - window_size : height - shift_size] = 1
+    h_idx[height - shift_size :] = 2
+
+    w_idx = torch.zeros(width, dtype=torch.long, device=device)
+    w_idx[width - window_size : width - shift_size] = 1
+    w_idx[width - shift_size :] = 2
+
+    # Calculate region index for each pixel using broadcasting
+    mask = h_idx.unsqueeze(1) * 3 + w_idx.unsqueeze(0)
+
+    mask_windows = window_partition(mask[None, ..., None], window_size)
+    return rearrange(mask_windows, "n w1 w2 1 -> n (w1 w2)")
+
+
+class WindowCrossAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        src_dim: int,
+        tgt_window_size: int,
+        src_window_size: int,
+        num_heads: int,
+        src_shift_size: int = 0,
+        tgt_shift_size: int = 0,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.tgt_window_size = tgt_window_size
+        self.src_window_size = src_window_size
+        self.src_shift_size = src_shift_size
+        self.tgt_shift_size = tgt_shift_size
+        self.dropout = dropout
+
+        self.pe = CayleySTRING(dim, num_heads)
+        self.query = nn.Linear(dim, dim, bias=False)
+        self.kv = nn.Linear(src_dim, dim * 2, bias=False)
+        self.wo = nn.Linear(dim, dim, bias=False)
+
+    def get_attn_mask(
+        self,
+        height: int,
+        width: int,
+        key_height: int,
+        key_width: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> Tensor | None:
+        if self.tgt_shift_size == 0:
+            return None
+
+        query_mask = get_mask_windows(
+            height, width, self.tgt_window_size, self.tgt_shift_size, device
+        )
+        key_mask = get_mask_windows(
+            key_height, key_width, self.src_window_size, self.src_shift_size, device
+        )
+
+        attn_mask = query_mask.unsqueeze(2) - key_mask.unsqueeze(1)
+        return attn_mask.type(dtype).masked_fill(attn_mask != 0, -torch.inf)
+
+    def forward(
+        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coords: Tensor
+    ) -> Tensor:
+        """Forward function for Window Cross-Attention.
+
+        Args:
+            tgt ([b, h, w, c]): Query hidden states.
+            src ([b, src_h, src_w, src_c]): Key/value hidden states.
+            tgt_coords ([b, h, w, 2]): Absolute positions of the queries.
+            src_coords ([b, src_h, src_w, 2]): Absolute positions of the keys.
+        """
+        b, h, w, c = tgt.shape
+        src_h, src_w = src.shape[1:3]
+
+        # cyclic shift
+        if self.tgt_shift_size > 0:
+            tgt = tgt.roll(
+                shifts=(-self.tgt_shift_size, -self.tgt_shift_size), dims=(1, 2)
+            )
+            tgt_coords = tgt_coords.roll(
+                shifts=(-self.tgt_shift_size, -self.tgt_shift_size), dims=(1, 2)
+            )
+
+        if self.src_shift_size > 0:
+            src = src.roll(
+                shifts=(-self.src_shift_size, -self.src_shift_size), dims=(1, 2)
+            )
+            src_coords = src_coords.roll(
+                shifts=(-self.src_shift_size, -self.src_shift_size), dims=(1, 2)
+            )
+
+        # partition windows
+        tgt = window_partition(tgt, self.tgt_window_size).flatten(1, 2)
+        tgt_coords = window_partition(tgt_coords, self.tgt_window_size).flatten(1, 2)
+        src = window_partition(src, self.src_window_size).flatten(1, 2)
+        src_coords = window_partition(src_coords, self.src_window_size).flatten(1, 2)
+
+        attn_mask = self.get_attn_mask(h, w, src_h, src_w, tgt.device, tgt.dtype)
+
+        if attn_mask is not None:
+            attn_mask = repeat(attn_mask, "n l s -> (b n) h l s", b=b, h=self.num_heads)
+
+        # W-MCA/SW-MCA
+        q = rearrange(self.query(tgt), "b n (h d) -> b h n d", h=self.num_heads)
+        k, v = rearrange(
+            self.kv(src), "b n (two h d) -> two b h n d", two=2, h=self.num_heads
+        )
+        x = F.scaled_dot_product_attention(
+            query=self.pe(q, tgt_coords),
+            key=self.pe(k, src_coords),
+            value=v,
+            attn_mask=attn_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+        )
+        tgt = self.wo(rearrange(x, "b h n d -> b n (h d)"))
+
+        # merge windows
+        tgt = tgt.view(-1, self.tgt_window_size, self.tgt_window_size, c)
+        tgt = window_reverse(tgt, self.tgt_window_size, h, w)
+
+        # reverse cyclic shift
+        if self.tgt_shift_size > 0:
+            tgt = torch.roll(
+                tgt, shifts=(self.tgt_shift_size, self.tgt_shift_size), dims=(1, 2)
+            )
+
+        return tgt
+
+
+class WindowSelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        window_size: int,
+        num_heads: int,
+        shift_size: int = 0,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.dropout = dropout
+
+        self.pe = CayleySTRING(dim, num_heads)
+        self.qkv = nn.Linear(dim, dim * 3, bias=False)
+        self.wo = nn.Linear(dim, dim, bias=False)
+
+    def get_attn_mask(
+        self, height: int, width: int, device: torch.device, dtype: torch.dtype
+    ) -> Tensor | None:
+        if self.shift_size == 0:
+            return None
+
+        mask_windows = get_mask_windows(
+            height, width, self.window_size, self.shift_size, device
+        )
+        # Calculate the attention mask based on window differences
+        attn_mask = mask_windows.unsqueeze(2) - mask_windows.unsqueeze(1)
+        return attn_mask.type(dtype).masked_fill(attn_mask != 0, -torch.inf)
+
+    def forward(self, x: Tensor, coords: Tensor) -> Tensor:
+        """Forward function for Window Self-Attention.
+
+        Args:
+            x ([b, h, w, c]): Hidden states.
+            coords ([b, h, w, 2]): Absolute positions.
+        """
+        b, h, w, c = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            x = x.roll(shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            coords = coords.roll(
+                shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
+            )
+
+        # partition windows
+        x = window_partition(x, self.window_size).flatten(1, 2)
+        coords = window_partition(coords, self.window_size).flatten(1, 2)
+
+        attn_mask = self.get_attn_mask(h, w, x.device, x.dtype)
+        if attn_mask is not None:
+            attn_mask = repeat(attn_mask, "n l s -> (b n) h l s", b=b, h=self.num_heads)
+
+        # W-MSA/SW-MSA
+        q, k, v = rearrange(
+            self.qkv(x), "b n (three h d) -> three b h n d", three=3, h=self.num_heads
+        )
+        x = F.scaled_dot_product_attention(
+            query=self.pe(q, coords),
+            key=self.pe(k, coords),
+            value=v,
+            attn_mask=attn_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+        )
+        x = self.wo(rearrange(x, "b h n d -> b n (h d)"))
+
+        # merge windows
+        x = x.view(-1, self.window_size, self.window_size, c)
+        x = window_reverse(x, self.window_size, h, w)
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+
+        return x
+
+
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        src_dim: int,
+        num_heads: int,
+        window_size: int,
+        tgt_window_size: int,
+        src_window_size: int,
+        shift_size: int = 0,
+        tgt_shift_size: int = 0,
+        src_shift_size: int = 0,
+        dropout: float = 0.1,
+    ) -> None:
+        super().__init__()
+
+        self.cross_attention = WindowCrossAttention(
+            dim,
+            src_dim,
+            num_heads=num_heads,
+            tgt_window_size=tgt_window_size,
+            src_window_size=src_window_size,
+            tgt_shift_size=tgt_shift_size,
+            src_shift_size=src_shift_size,
+            dropout=dropout,
+        )
+        self.cross_attention_norm = nn.LayerNorm(dim)
+        self.cross_attention_dropout = nn.Dropout(dropout)
+
+        self.self_attention = WindowSelfAttention(
+            dim, window_size, num_heads, shift_size, dropout=dropout
+        )
+        self.self_attention_norm = nn.LayerNorm(dim)
+        self.self_attention_dropout = nn.Dropout(dropout)
+
+        self.ffn = FeedForward(dim, dim * 4)
+        self.ffn_norm = nn.LayerNorm(dim)
+        self.ffn_dropout = nn.Dropout(dropout)
+
+    def forward(
+        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coords: Tensor
+    ) -> Tensor:
+        x = self.self_attention(tgt, tgt_coords)
+        tgt = self.self_attention_norm(tgt + self.self_attention_dropout(x))
+
+        x = self.cross_attention(tgt, src, tgt_coords, src_coords)
+        tgt = self.cross_attention_norm(tgt + self.cross_attention_dropout(x))
+
+        return self.ffn_norm(tgt + self.ffn_dropout(self.ffn(tgt)))
+
+
+class Stage(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        src_dim: int,
+        depth: int,
+        num_heads: int,
+        window_size: int,
+        tgt_window_size: int,
+        src_window_size: int,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            block = Block(
+                dim=dim,
+                src_dim=src_dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                tgt_window_size=tgt_window_size,
+                src_window_size=src_window_size,
+                shift_size=0 if i % 2 == 0 else window_size // 2,
+                tgt_shift_size=0 if i % 2 == 0 else tgt_window_size // 2,
+                src_shift_size=0 if i % 2 == 0 else src_window_size // 2,
+                dropout=dropout,
+            )
+            self.blocks.append(block)
+
+    def forward(
+        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coords: Tensor
+    ) -> Tensor:
+        for block in self.blocks:
+            tgt = block(tgt, src, tgt_coords, src_coords)
+        return tgt
+
+
+class FeatureSampling(nn.Module):
+    def __init__(self, in_dim: int, out_dim: int) -> None:
+        super().__init__()
+        self.reduction = nn.Linear(in_dim, out_dim, bias=False)
+        self.norm = nn.LayerNorm(out_dim)
+
+    def forward(self, points: Tensor, feature: Tensor) -> Tensor:
+        x = F.grid_sample(feature, points * 2 - 1, align_corners=False)
+        return self.norm(self.reduction(rearrange(x, "b c h w -> b h w c")))
+
+
+class LSPTransformer(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_classes: int,
+        query_block_size: int,
+        in_channels: list[int],
+        depths: list[int],
+        num_heads: int,
+        window_size: int,
+        tgt_window_sizes: list[int],
+        src_window_sizes: list[int],
+        num_radial_distances: int,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+
+        self.dim = dim
+        self.query_block_size = query_block_size
+        self.num_radial_distances = num_radial_distances
+
+        bottleneck, *in_channels = in_channels
+        self.feature_sampling = FeatureSampling(bottleneck, dim)
+
+        self.stages = nn.ModuleList()
+        for i, depth in enumerate(depths):
+            stage = Stage(
+                dim=dim,
+                src_dim=in_channels[i],
+                depth=depth,
+                num_heads=num_heads,
+                window_size=window_size,
+                tgt_window_size=tgt_window_sizes[i],
+                src_window_size=src_window_sizes[i],
+                dropout=dropout,
+            )
+            self.stages.append(stage)
+
+        self.input_norm = nn.ModuleList(nn.LayerNorm(d) for d in in_channels)
+
+        # output heads
+        self.class_head = nn.Linear(dim, num_classes + 1, bias=False)
+        self.point_head = MLP(dim, dim, 2, 3)
+        self.radial_distances_head = MLP(dim, dim, num_radial_distances, 3)
+
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        # initialize regression layers
+        nn.init.constant_(self.point_head[-1].weight, 0.0)
+        nn.init.constant_(self.point_head[-1].bias, 0.0)
+
+    def forward(
+        self, multi_scale_features: list[Tensor], height: int, width: int
+    ) -> dict[str, Tensor | list[dict[str, Tensor]]]:
+        *multi_scale_features, bottleneck = multi_scale_features
+
+        b = bottleneck.size(0)
+
+        src = []
+        src_coords = []
+        for i, feature in enumerate(reversed(multi_scale_features)):
+            h, w = feature.shape[2:4]
+            # zeros pass through sigmoid as 0.5, i.e. the center of each grid cell
+            coords = torch.zeros(b, h, w, 2, dtype=torch.float32, device=feature.device)
+            src.append(self.input_norm[i](rearrange(feature, "b c h w -> b h w c")))
+            src_coords.append(relative_to_absolute_points(coords, height, width))
+
+        ref_points = torch.zeros(
+            b,
+            height // self.query_block_size,
+            width // self.query_block_size,
+            2,
+            dtype=torch.float32,
+            device=bottleneck.device,
+        )  # center positions
+        tgt = self.feature_sampling(
+            relative_to_absolute_points_normalized(ref_points), bottleneck
+        )
+
+        logits_list: list[Tensor] = []
+        ref_points_list: list[Tensor] = []
+        radial_distances_list: list[Tensor] = []
+
+        new_ref_points = ref_points.clone()  # for look forward twice
+        for i, stage in enumerate(self.stages):
+            tgt = stage(
+                tgt=tgt,
+                src=src[i],
+                tgt_coords=relative_to_absolute_points(ref_points, height, width),
+                src_coords=src_coords[i],
+            )
+
+            # output heads
+            delta_point = self.point_head(tgt)
+            radial_distances = self.radial_distances_head(tgt)
+            logits = self.class_head(tgt)
+
+            ref_points_list.append(
+                relative_to_absolute_points_normalized(
+                    new_ref_points + delta_point
+                ).flatten(1, 2)
+            )
+            logits_list.append(logits.flatten(1, 2))
+            radial_distances_list.append(radial_distances.flatten(1, 2))
+
+            new_ref_points = ref_points + delta_point
+            ref_points = new_ref_points.detach()
+
+        return {
+            "logits": logits_list[-1],
+            "points": ref_points_list[-1],
+            "radial_distances": radial_distances_list[-1],
+            "polygons": self.get_polygons(
+                relative_to_absolute_points(ref_points, height, width).flatten(1, 2),
+                radial_distances_list[-1],
+            ),
+            "aux_outputs": [
+                {
+                    "logits": stage_logits,
+                    "points": stage_points,
+                    "radial_distances": stage_distances,
+                }
+                for stage_logits, stage_points, stage_distances in zip(
+                    logits_list[:-1],
+                    ref_points_list[:-1],
+                    radial_distances_list[:-1],
+                    strict=True,
+                )
+            ],
+        }
+
+    @torch.no_grad()
+    @torch.autocast("cuda", enabled=False)
+    def get_polygons(self, ref_points: Tensor, radial_distances: Tensor) -> Tensor:
+        t = torch.linspace(
+            0, 1, self.num_radial_distances + 1, device=ref_points.device
+        )[:-1]
+        cos = torch.cos(2 * torch.pi * t)
+        sin = torch.sin(2 * torch.pi * t)
+
+        radial_distances = radial_distances.expm1()
+        polar = radial_distances.unsqueeze(-1) * torch.stack([sin, cos], dim=-1)
+        return ref_points.unsqueeze(-2) + polar
+
+
+class LSPDETR(PreTrainedModel):
+    config_class = LSPDETRConfig
+
+    def __init__(self, config: LSPDETRConfig) -> None:
+        super().__init__(config)
+
+        self.backbone = Swinv2Backbone.from_pretrained(
+            config.backbone, out_features=["stage1", "stage2", "stage3", "stage4"]
+        )
+
+        self.decode_head = LSPTransformer(
+            dim=config.dim,
+            num_classes=config.num_classes,
+            query_block_size=config.query_block_size,
+            in_channels=config.in_channels,
+            depths=config.depths,
+            num_heads=config.num_heads,
+            window_size=config.window_size,
+            tgt_window_sizes=config.tgt_window_sizes,
+            src_window_sizes=config.src_window_sizes,
+            num_radial_distances=config.num_radial_distances,
+            dropout=config.dropout,
+        )
+
+    def forward(self, image: Tensor) -> dict[str, Tensor]:
+        features = self.backbone(image).feature_maps
+        height, width = image.shape[2:]
+        return self.decode_head(features, height, width)
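Putting the pieces together, LSPDETR.forward maps a [b, 3, H, W] image batch to per-query logits, center points, radial distances, and decoded polygons, plus aux outputs from the intermediate stages. A dummy forward-pass sketch under the default config, assuming `model` was loaded as sketched after config.json; 256x256 matches the SwinV2 backbone's pretraining resolution and, with query_block_size=16, gives a 16x16 = 256 query grid:

import torch

model.eval()
image = torch.randn(1, 3, 256, 256)

with torch.no_grad():
    out = model(image)

print(out["logits"].shape)            # [1, 256, num_classes + 1]
print(out["points"].shape)            # [1, 256, 2], normalized to [0, 1]
print(out["radial_distances"].shape)  # [1, 256, 64]
print(out["polygons"].shape)          # [1, 256, 64, 2], in pixel coordinates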