matejpekar committed
Commit 389a8d6 · verified · 1 Parent(s): 835482b

Upload model

Files changed (4):
  1. config.json +32 -19
  2. configuration.py +24 -14
  3. model.safetensors +2 -2
  4. modeling.py +292 -363
config.json CHANGED
@@ -16,31 +16,44 @@
       "stage4"
     ]
   },
-  "depths": [
-    6,
-    2,
-    2
+  "cross_sta_config": [
+    {
+      "kernel": 5,
+      "kv_tile": 8,
+      "q_tile": 3
+    },
+    {
+      "kernel": 5,
+      "kv_tile": 4,
+      "q_tile": 3
+    },
+    {
+      "kernel": 5,
+      "kv_tile": 2,
+      "q_tile": 3
+    }
   ],
   "dim": 384,
-  "dropout": 0.1,
+  "feature_levels": [
+    2,
+    1,
+    0,
+    2,
+    1,
+    0
+  ],
   "model_type": "lsp_detr",
-  "num_classes": 1,
+  "num_classes": 5,
   "num_heads": 12,
   "num_radial_distances": 64,
   "query_block_size": 14.222222222222223,
-  "src_window_sizes": [
-    8,
-    16,
-    32
-  ],
-  "tgt_window_sizes": [
-    9,
-    9,
-    9
-  ],
+  "self_sta_config": {
+    "kernel": 3,
+    "kv_tile": 3,
+    "q_tile": 3
+  },
   "torch_dtype": "float32",
-  "transformers_version": "4.51.3",
+  "transformers_version": "4.52.3",
   "use_pretrained_backbone": true,
-  "use_timm_backbone": false,
-  "window_size": 9
+  "use_timm_backbone": false
 }
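For orientation, the new STA (presumably sliding-tile attention) keys can be read straight from the JSON. A minimal sketch, assuming config.json has been downloaded to the working directory:

import json

with open("config.json") as f:
    config = json.load(f)

# One STA config per feature level for cross-attention,
# and a single shared STA config for self-attention.
for level, sta in enumerate(config["cross_sta_config"]):
    print(level, sta["kernel"], sta["q_tile"], sta["kv_tile"])
print(config["self_sta_config"])  # {'kernel': 3, 'kv_tile': 3, 'q_tile': 3}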
configuration.py CHANGED
@@ -1,9 +1,15 @@
-from typing import Any
+from typing import Any, TypedDict

 from transformers import PretrainedConfig
 from transformers.utils.backbone_utils import verify_backbone_config_arguments


+class STAConfig(TypedDict):
+    kernel: int
+    q_tile: int
+    kv_tile: int
+
+
 class LSPDetrConfig(PretrainedConfig):
     model_type = "lsp_detr"

@@ -15,17 +21,22 @@ class LSPDetrConfig(PretrainedConfig):
         backbone_kwargs: dict[str, Any] | None = None,
         backbone_config: Any | None = None,
         dim: int = 384,
-        num_classes: int = 1,
-        depths: tuple[int, ...] = (6, 2, 2),
-        query_block_size: int = 16,
         num_heads: int = 12,
-        window_size: int = 8,
-        tgt_window_sizes: tuple[int, ...] = (8, 8, 8),
-        src_window_sizes: tuple[int, ...] = (8, 16, 32),
+        num_classes: int = 1,
+        query_block_size: float = 14.222222222222223,  # 256 / 18
+        feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
         num_radial_distances: int = 64,
-        dropout: float = 0.1,
+        self_sta_config: STAConfig | None = None,
+        cross_sta_config: tuple[STAConfig, ...] = (
+            {"kernel": 5, "q_tile": 3, "kv_tile": 8},
+            {"kernel": 5, "q_tile": 3, "kv_tile": 4},
+            {"kernel": 5, "q_tile": 3, "kv_tile": 2},
+        ),
         **kwargs,
     ) -> None:
+        if self_sta_config is None:
+            self_sta_config = {"kernel": 3, "q_tile": 3, "kv_tile": 3}
+
         if backbone_kwargs is None:
             backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}

@@ -43,13 +54,12 @@ class LSPDetrConfig(PretrainedConfig):
         self.backbone_config = backbone_config
         self.backbone_kwargs = backbone_kwargs
         self.dim = dim
+        self.num_heads = num_heads
         self.num_classes = num_classes
-        self.depths = depths
         self.query_block_size = query_block_size
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.tgt_window_sizes = tgt_window_sizes
-        self.src_window_sizes = src_window_sizes
+        self.feature_levels = feature_levels
         self.num_radial_distances = num_radial_distances
-        self.dropout = dropout
+        self.self_sta_config = self_sta_config
+        self.cross_sta_config = cross_sta_config
+
         super().__init__(**kwargs)
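Since STAConfig is a TypedDict, each attention config is a plain dict with checked keys. A usage sketch (hypothetical; backbone-related arguments, which these hunks do not show, are left at their defaults, and the two files above are assumed importable as a package):

from configuration import LSPDetrConfig, STAConfig

sta: STAConfig = {"kernel": 5, "q_tile": 3, "kv_tile": 8}

# num_classes=5 matches the committed config.json above.
config = LSPDetrConfig(num_classes=5, cross_sta_config=(sta, sta, sta))
assert config.self_sta_config == {"kernel": 3, "q_tile": 3, "kv_tile": 3}
assert config.feature_levels == (2, 1, 0, 2, 1, 0)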
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f1366763c506eadd8933ed22481c4f11ab6d2984ac78f91157c461d3ad59c526
-size 204465704
+oid sha256:3f5437eb889a864ff88ae121ed7581217778b30430657ce39751a7f5e4b96082
+size 180151024
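model.safetensors is a Git LFS pointer, so only its oid and size change here; the checkpoint shrinks from ~204 MB to ~180 MB, consistent with the refactor from stacked window-attention blocks to six STA layers. A downloaded copy can be checked against the pointer — a sketch, assuming the resolved file sits in the working directory:

import hashlib
from pathlib import Path

data = Path("model.safetensors").read_bytes()
print(len(data))                         # expected: 180151024
print(hashlib.sha256(data).hexdigest())  # expected: 3f5437eb88...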
modeling.py CHANGED
@@ -1,67 +1,24 @@
 import math
+from functools import cached_property, lru_cache

 import torch
 import torch.nn.functional as F
-from einops import rearrange, repeat
+from einops import rearrange
 from torch import Tensor, nn
+from torch.nn.attention.flex_attention import (
+    BlockMask,
+    _mask_mod_signature,
+    create_block_mask,
+    flex_attention,
+)
 from torch.nn.utils import parametrize
-from transformers import PreTrainedModel
-from transformers.models.swinv2.modeling_swinv2 import window_partition, window_reverse
+from transformers.modeling_utils import PreTrainedModel
 from transformers.utils.backbone_utils import load_backbone

-from .configuration import LSPDetrConfig
+from .configuration import LSPDetrConfig, STAConfig


-class MLP(nn.Sequential):
-    """Very simple multi-layer perceptron."""
-
-    def __init__(
-        self,
-        input_dim: int,
-        hidden_dim: int,
-        output_dim: int,
-        num_layers: int,
-        act_layer: type[nn.Module] = nn.GELU,
-        dropout: float = 0.0,
-    ) -> None:
-        assert num_layers > 1
-
-        layers = []
-        h = [hidden_dim] * (num_layers - 1)
-        for n, k in zip([input_dim, *h], h, strict=False):
-            layers.append(nn.Linear(n, k))
-            layers.append(act_layer())
-            if dropout > 0:
-                layers.append(nn.Dropout(dropout))
-
-        layers.append(nn.Linear(hidden_dim, output_dim))
-        super().__init__(*layers)
-
-
-class FeedForward(nn.Module):
-    """FeedForward module.
-
-    Taken from https://github.com/meta-llama/llama-models/blob/main/models/llama4/ffn.py
-    """
-
-    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 256) -> None:
-        """Initialize the FeedForward module.
-
-        Args:
-            dim (int): Input dimension.
-            hidden_dim (int): Hidden dimension of the feedforward layer.
-            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
-        """
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
-        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
-        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
-        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
-
-    def forward(self, x: Tensor) -> Tensor:
-        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+flex_attention = torch.compile(flex_attention, dynamic=True)


 def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
@@ -134,6 +91,11 @@ class CayleySTRING(nn.Module):
     def init_weights(self) -> None:
         self.S = nn.init.kaiming_uniform_(self.S, a=math.sqrt(5))

+    @cached_property
+    def P(self) -> Tensor:
+        i_plus_s_inv = torch.linalg.inv(self.I + self.S)
+        return torch.matmul(self.I - self.S, i_plus_s_inv)
+
     @parametrize.cached()
     @torch.autocast("cuda", enabled=False)
     def forward(self, x: Tensor, positions: Tensor) -> Tensor:
@@ -144,13 +106,18 @@ class CayleySTRING(nn.Module):
             positions ([b, n, pos_dim]): Positions tensor.
         """
         # Compute (I + S)^-1 @ x
-        y = torch.linalg.solve(
-            self.I + self.S, rearrange(x.float(), "b h n d -> h d (b n)")
-        )
+        if self.training:
+            # Use linalg.solve during training for numerical stability.
+            y = torch.linalg.solve(
+                self.I + self.S, rearrange(x.float(), "b h n d -> (b h) d n")
+            )
+            px = torch.matmul(self.I - self.S, y)
+            px = rearrange(px, "(b h) d n -> b h n d", b=x.size(0))
+        else:
+            # During inference, use the pre-calculated matrix P for performance.
+            px = x.float() @ self.P.T

-        # change of basis
-        px = torch.matmul(self.I - self.S, y)
-        px = rearrange(px, "h d (b n) -> b h n d", b=x.size(0)).contiguous()
+        px = px.contiguous()

         # apply RoPE-Mixed
         angles = torch.einsum("bnk,khc->bhnc", positions, self.freqs)
@@ -161,11 +128,123 @@ class CayleySTRING(nn.Module):
         return out.type_as(x)


-def maybe_pad(x: Tensor, window_size: int) -> Tensor:
-    h, w = x.shape[1:3]
-    pad_right = (window_size - w % window_size) % window_size
-    pad_bottom = (window_size - h % window_size) % window_size
-    return F.pad(x, (0, 0, 0, pad_right, 0, pad_bottom))
+class MLP(nn.Sequential):
+    """Very simple multi-layer perceptron."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act_layer: type[nn.Module] = nn.GELU,
+        dropout: float = 0.0,
+    ) -> None:
+        assert num_layers > 1
+
+        layers = []
+        h = [hidden_dim] * (num_layers - 1)
+        for n, k in zip([input_dim, *h], h, strict=False):
+            layers.append(nn.Linear(n, k))
+            layers.append(act_layer())
+            if dropout > 0:
+                layers.append(nn.Dropout(dropout))
+
+        layers.append(nn.Linear(hidden_dim, output_dim))
+        super().__init__(*layers)
+
+
+class FeedForward(nn.Module):
+    """FeedForward module.
+
+    Taken from https://github.com/meta-llama/llama-models/blob/main/models/llama4/ffn.py
+    """
+
+    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 256) -> None:
+        """Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+        """
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+def generate_sta_mask(
+    q_canvas_w: int,
+    kv_canvas_hw: tuple[int, int],
+    kernel: int,
+    q_tile: int,
+    kv_tile: int,
+) -> _mask_mod_signature:
+    q_canvas_tile_w = q_canvas_w // q_tile
+    kv_canvas_tile_h = kv_canvas_hw[0] // kv_tile
+    kv_canvas_tile_w = kv_canvas_hw[1] // kv_tile
+
+    def q_tile_rescale(x: Tensor):
+        # Computes round(x * (kv_canvas_tile_w - 1) / (q_canvas_tile_w - 1))
+        scale_numerator = kv_canvas_tile_w - 1
+        scale_denominator = q_canvas_tile_w - 1
+        return (x * scale_numerator + scale_denominator // 2) // scale_denominator
+
+    def get_tile_xy(
+        idx: Tensor, tile_size: int, canvas_tile_w: int
+    ) -> tuple[Tensor, Tensor]:
+        tile_id = idx // (tile_size * tile_size)
+        tile_x = tile_id % canvas_tile_w
+        tile_y = tile_id // canvas_tile_w
+        return tile_x, tile_y
+
+    def sta_mask_2d(b: Tensor, h: Tensor, q_idx: Tensor, kv_idx: Tensor) -> Tensor:
+        q_x_tile, q_y_tile = get_tile_xy(q_idx, q_tile, q_canvas_tile_w)
+        kv_x_tile, kv_y_tile = get_tile_xy(kv_idx, kv_tile, kv_canvas_tile_w)
+
+        q_x_tile = q_tile_rescale(q_x_tile)
+        q_y_tile = q_tile_rescale(q_y_tile)
+
+        center_x = q_x_tile.clamp(kernel // 2, (kv_canvas_tile_w - 1) - kernel // 2)
+        center_y = q_y_tile.clamp(kernel // 2, (kv_canvas_tile_h - 1) - kernel // 2)
+
+        # Apply kernel mask in canvas coordinates (not tile coordinates)
+        x_mask = torch.abs(center_x - kv_x_tile) <= kernel // 2
+        y_mask = torch.abs(center_y - kv_y_tile) <= kernel // 2
+
+        return x_mask & y_mask
+
+    return sta_mask_2d
+
+
+@lru_cache
+def create_sta_block_mask(
+    q_len: int,
+    kv_len: int,
+    q_width: int,
+    kv_width: int,
+    kernel: int,
+    q_tile: int,
+    kv_tile: int,
+) -> BlockMask:
+    return create_block_mask(
+        generate_sta_mask(
+            q_width, (kv_len // kv_width, kv_width), kernel, q_tile, kv_tile
+        ),
+        B=None,
+        H=None,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+        Q_LEN=q_len,
+        KV_LEN=kv_len,
+        _compile=True,
+    )


 @torch.autocast("cuda", enabled=False)
@@ -181,315 +260,142 @@ def relative_to_absolute_pos(pos: Tensor, step_x: float, step_y: float) -> Tensor:
     return torch.stack((absolute_x, absolute_y), dim=-1)


-def get_mask_windows(
-    height: int, width: int, window_size: int, shift_size: int, device: torch.device
-) -> Tensor:
-    # Create indices for height and width regions
-    h_idx = torch.zeros(height, dtype=torch.long, device=device)
-    h_idx[height - window_size : height - shift_size] = 1
-    h_idx[height - shift_size :] = 2
-
-    w_idx = torch.zeros(width, dtype=torch.long, device=device)
-    w_idx[width - window_size : width - shift_size] = 1
-    w_idx[width - shift_size :] = 2
-
-    # Calculate region index for each pixel using broadcasting
-    mask = h_idx.unsqueeze(1) * 3 + w_idx.unsqueeze(0)
-
-    mask_windows = window_partition(mask[None, ..., None], window_size)
-    return rearrange(mask_windows, "n w1 w2 1 -> n (w1 w2)")
-
-
-class WindowCrossAttention(nn.Module):
+class STAttention(nn.Module):
     def __init__(
         self,
         dim: int,
         src_dim: int,
-        tgt_window_size: int,
-        src_window_size: int,
         num_heads: int,
-        src_shift_size: int = 0,
-        tgt_shift_size: int = 0,
-        dropout: float = 0.0,
+        kernel: int,
+        q_tile: int,
+        kv_tile: int,
     ) -> None:
         super().__init__()
-
         self.num_heads = num_heads
-        self.tgt_window_size = tgt_window_size
-        self.src_window_size = src_window_size
-        self.src_shift_size = src_shift_size
-        self.tgt_shift_size = tgt_shift_size
-        self.dropout = dropout
+        self.kernel = kernel
+        self.q_tile = q_tile
+        self.kv_tile = kv_tile

         self.pe = CayleySTRING(dim, num_heads)
-        self.query = nn.Linear(dim, dim, bias=False)
+        self.q = nn.Linear(dim, dim, bias=False)
         self.kv = nn.Linear(src_dim, dim * 2, bias=False)
         self.wo = nn.Linear(dim, dim, bias=False)

-    def get_attn_mask(
-        self,
-        height: int,
-        width: int,
-        key_height: int,
-        key_width: int,
-        device: torch.device,
-        dtype: torch.dtype,
-    ) -> Tensor | None:
-        if self.tgt_shift_size == 0:
-            return None
-
-        query_mask = get_mask_windows(
-            height, width, self.tgt_window_size, self.tgt_shift_size, device
-        )
-        key_mask = get_mask_windows(
-            key_height, key_width, self.src_window_size, self.src_shift_size, device
-        )
-
-        attn_mask = query_mask.unsqueeze(2) - key_mask.unsqueeze(1)
-        return attn_mask.type(dtype).masked_fill(attn_mask != 0, -torch.inf)
+    def maybe_pad(self, x: Tensor, tile: int) -> Tensor:
+        h, w = x.shape[1:3]
+        pad_right = (tile - w % tile) % tile
+        pad_bottom = (tile - h % tile) % tile
+        return F.pad(x, (0, 0, 0, pad_right, 0, pad_bottom))
+
+    def tile(self, x: Tensor, height: int, tile: int) -> tuple[Tensor, int, int]:
+        x = rearrange(x, "b head (h w) dim -> b h w (head dim)", h=height)
+        x = self.maybe_pad(x, tile)
+        h, w = x.shape[1:3]
+        x = rearrange(
+            x,
+            "b (n_h ts_h) (n_w ts_w) (h d) -> b h (n_h n_w ts_h ts_w) d",
+            ts_h=tile,
+            ts_w=tile,
+            h=self.num_heads,
+        )
+        return x, h, w

     def forward(
-        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coord: Tensor
+        self, tgt: Tensor, src: Tensor, q_coords: Tensor, k_coords: Tensor
     ) -> Tensor:
-        b, h, w, c = tgt.shape
-
-        # pad to multiples of window size
-        tgt = maybe_pad(tgt, self.tgt_window_size)
-        src = maybe_pad(src, self.src_window_size)
-        tgt_coords = maybe_pad(tgt_coords, self.tgt_window_size)
-        src_coord = maybe_pad(src_coord, self.src_window_size)
-        h_pad, w_pad = tgt.shape[1:3]
-        src_h, src_w = src.shape[1:3]
-
-        # cyclic shift
-        if self.tgt_shift_size > 0:
-            tgt = tgt.roll(
-                shifts=(-self.tgt_shift_size, -self.tgt_shift_size), dims=(1, 2)
-            )
-            tgt_coords = tgt_coords.roll(
-                shifts=(-self.tgt_shift_size, -self.tgt_shift_size), dims=(1, 2)
-            )
-
-        if self.src_shift_size > 0:
-            src = src.roll(
-                shifts=(-self.src_shift_size, -self.src_shift_size), dims=(1, 2)
-            )
-            src_coord = src_coord.roll(
-                shifts=(-self.src_shift_size, -self.src_shift_size), dims=(1, 2)
-            )
-
-        # partition windows
-        tgt = window_partition(tgt, self.tgt_window_size).flatten(1, 2)
-        src = window_partition(src, self.src_window_size).flatten(1, 2)
-        tgt_coords = window_partition(tgt_coords, self.tgt_window_size).flatten(1, 2)
-        src_coord = window_partition(src_coord, self.src_window_size).flatten(1, 2)
-
-        attn_mask = self.get_attn_mask(
-            h_pad, w_pad, src_h, src_w, tgt.device, tgt.dtype
-        )
-
-        if attn_mask is not None:
-            attn_mask = repeat(attn_mask, "n l s -> (b n) h l s", b=b, h=self.num_heads)
-
-        # W-MCA/SW-MCA
-        q = rearrange(self.query(tgt), "b n (h d) -> b h n d", h=self.num_heads)
+        h, w = tgt.shape[1:3]
+
+        q = rearrange(
+            self.q(tgt), "b h w (head d) -> b head (h w) d", head=self.num_heads
+        )
         k, v = rearrange(
-            self.kv(src), "b n (two h d) -> two b h n d", two=2, h=self.num_heads
+            self.kv(src),
+            "b h w (two head d) -> two b head (h w) d",
+            two=2,
+            head=self.num_heads,
         )
-        x = F.scaled_dot_product_attention(
-            query=self.pe(q, tgt_coords),
-            key=self.pe(k, src_coord),
-            value=v,
-            attn_mask=attn_mask,
-            dropout_p=self.dropout if self.training else 0.0,
-        )
-        tgt = self.wo(rearrange(x, "b h n d -> b n (h d)"))
-
-        # merge windows
-        tgt = tgt.view(-1, self.tgt_window_size, self.tgt_window_size, c)
-        tgt = window_reverse(tgt, self.tgt_window_size, h_pad, w_pad)
-
-        # reverse cyclic shift
-        if self.tgt_shift_size > 0:
-            tgt = torch.roll(
-                tgt, shifts=(self.tgt_shift_size, self.tgt_shift_size), dims=(1, 2)
-            )
-
-        return tgt[:, :h, :w, :].contiguous()  # remove padding
-
-
-class WindowSelfAttention(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        window_size: int,
-        num_heads: int,
-        shift_size: int = 0,
-        dropout: float = 0.0,
-    ) -> None:
-        super().__init__()
-
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.dropout = dropout
-
-        self.pe = CayleySTRING(dim, num_heads)
-        self.qkv = nn.Linear(dim, dim * 3, bias=False)
-        self.wo = nn.Linear(dim, dim, bias=False)
-
-    def get_attn_mask(
-        self, height: int, width: int, device: torch.device, dtype: torch.dtype
-    ) -> Tensor | None:
-        if self.shift_size == 0:
-            return None
-
-        mask_windows = get_mask_windows(
-            height, width, self.window_size, self.shift_size, device
-        )
-        # Calculate the attention mask based on window differences
-        attn_mask = mask_windows.unsqueeze(2) - mask_windows.unsqueeze(1)
-        return attn_mask.type(dtype).masked_fill(attn_mask != 0, -torch.inf)
-
-    def forward(self, x: Tensor, coords: Tensor) -> Tensor:
-        """Forward function for Window Self-Attention.
-
-        Args:
-            x ([b, h, w, c]): Hidden states.
-            coords ([b, h, w, 2]): Absolute positions.
-        """
-        b, h, w, c = x.shape
-
-        # pad to multiples of window size
-        x = maybe_pad(x, self.window_size)
-        coords = maybe_pad(coords, self.window_size)
-        h_pad, w_pad = x.shape[1:3]
-
-        # cyclic shift
-        if self.shift_size > 0:
-            x = x.roll(shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-            coords = coords.roll(
-                shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
-            )
-
-        # partition windows
-        x = window_partition(x, self.window_size).flatten(1, 2)
-        coords = window_partition(coords, self.window_size).flatten(1, 2)
-
-        attn_mask = self.get_attn_mask(h_pad, w_pad, x.device, x.dtype)
-        if attn_mask is not None:
-            attn_mask = repeat(attn_mask, "n l s -> (b n) h l s", b=b, h=self.num_heads)
-
-        # W-MSA/SW-MSA
-        q, k, v = rearrange(
-            self.qkv(x), "b n (three h d) -> three b h n d", three=3, h=self.num_heads
-        )
-        x = F.scaled_dot_product_attention(
-            query=self.pe(q, coords),
-            key=self.pe(k, coords),
-            value=v,
-            attn_mask=attn_mask,
-            dropout_p=self.dropout if self.training else 0.0,
-        )
-        x = self.wo(rearrange(x, "b h n d -> b n (h d)"))
-
-        # merge windows
-        x = x.view(-1, self.window_size, self.window_size, c)
-        x = window_reverse(x, self.window_size, h_pad, w_pad)
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-
-        return x[:, :h, :w, :].contiguous()  # remove padding
-
-
-class Block(nn.Module):
+
+        # RoPE
+        q = self.pe(q, q_coords)
+        k = self.pe(k, k_coords)
+
+        # tile
+        q, q_h, q_w = self.tile(q, h, self.q_tile)
+        k, _, kv_w = self.tile(k, src.shape[1], self.kv_tile)
+        v, _, _ = self.tile(v, src.shape[1], self.kv_tile)
+
+        # flex attention
+        block_mask = create_sta_block_mask(
+            q_len=q.shape[2],
+            kv_len=k.shape[2],
+            q_width=q_w,
+            kv_width=kv_w,
+            kernel=self.kernel,
+            q_tile=self.q_tile,
+            kv_tile=self.kv_tile,
+        )
+        x = flex_attention(q, k, v, block_mask=block_mask)
+
+        # un-tile
+        x = rearrange(
+            x,
+            "b h (n_h n_w ts_h ts_w) d -> b (n_h ts_h) (n_w ts_w) (h d)",
+            n_h=q_h // self.q_tile,
+            n_w=q_w // self.q_tile,
+            ts_h=self.q_tile,
+            ts_w=self.q_tile,
+        )
+
+        # remove padding
+        x = x[:, :h, :w, :].contiguous()
+
+        return self.wo(x)
+
+
+class Layer(nn.Module):
     def __init__(
         self,
         dim: int,
         src_dim: int,
         num_heads: int,
-        window_size: int,
-        tgt_window_size: int,
-        src_window_size: int,
-        shift_size: int = 0,
-        tgt_shift_size: int = 0,
-        src_shift_size: int = 0,
-        dropout: float = 0.1,
+        self_sta_config: STAConfig,
+        cross_sta_config: STAConfig,
     ) -> None:
         super().__init__()

-        self.cross_attention = WindowCrossAttention(
+        self.self_attention = STAttention(
             dim,
-            src_dim,
-            num_heads=num_heads,
-            tgt_window_size=tgt_window_size,
-            src_window_size=src_window_size,
-            tgt_shift_size=tgt_shift_size,
-            src_shift_size=src_shift_size,
-            dropout=dropout,
+            dim,
+            num_heads,
+            kernel=self_sta_config["kernel"],
+            q_tile=self_sta_config["q_tile"],
+            kv_tile=self_sta_config["kv_tile"],
         )
-        self.cross_attention_norm = nn.LayerNorm(dim)
-        self.cross_attention_dropout = nn.Dropout(dropout)
+        self.self_attention_norm = nn.LayerNorm(dim)

-        self.self_attention = WindowSelfAttention(
-            dim, window_size, num_heads, shift_size, dropout=dropout
+        self.cross_attention = STAttention(
+            dim,
+            src_dim,
+            num_heads,
+            kernel=cross_sta_config["kernel"],
+            q_tile=cross_sta_config["q_tile"],
+            kv_tile=cross_sta_config["kv_tile"],
         )
-        self.self_attention_norm = nn.LayerNorm(dim)
-        self.self_attention_dropout = nn.Dropout(dropout)
+        self.cross_attention_norm = nn.LayerNorm(dim)

         self.ffn = FeedForward(dim, dim * 4)
         self.ffn_norm = nn.LayerNorm(dim)
-        self.ffn_dropout = nn.Dropout(dropout)

     def forward(
-        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coords
+        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coords: Tensor
     ) -> Tensor:
-        x = self.self_attention(tgt, tgt_coords)
-        tgt = self.self_attention_norm(tgt + self.self_attention_dropout(x))
+        x = self.self_attention(tgt, tgt, tgt_coords, tgt_coords)
+        tgt = self.self_attention_norm(tgt + x)

         x = self.cross_attention(tgt, src, tgt_coords, src_coords)
-        tgt = self.cross_attention_norm(tgt + self.cross_attention_dropout(x))
+        tgt = self.cross_attention_norm(tgt + x)

-        return self.ffn_norm(tgt + self.ffn_dropout(self.ffn(tgt)))
-
-
-class Stage(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        src_dim: int,
-        depth: int,
-        num_heads: int,
-        window_size: int,
-        tgt_window_size: int,
-        src_window_size: int,
-        dropout: float = 0.0,
-    ) -> None:
-        super().__init__()
-        self.blocks = nn.ModuleList()
-        for i in range(depth):
-            block = Block(
-                dim=dim,
-                src_dim=src_dim,
-                num_heads=num_heads,
-                window_size=window_size,
-                tgt_window_size=tgt_window_size,
-                src_window_size=src_window_size,
-                shift_size=0 if i % 2 == 0 else window_size // 2,
-                tgt_shift_size=0 if i % 2 == 0 else tgt_window_size // 2,
-                src_shift_size=0 if i % 2 == 0 else src_window_size // 2,
-                dropout=dropout,
-            )
-            self.blocks.append(block)
-
-    def forward(
-        self, tgt: Tensor, src: Tensor, tgt_coords: Tensor, src_coords: Tensor
-    ) -> Tensor:
-        for block in self.blocks:
-            tgt = block(tgt, src, tgt_coords, src_coords)
-        return tgt
+        return self.ffn_norm(tgt + self.ffn(tgt))


 class LSPTransformer(nn.Module):
@@ -498,36 +404,45 @@ class LSPTransformer(nn.Module):

         self.query_block_size = config.query_block_size
         self.num_radial_distances = config.num_radial_distances
+        self.feature_levels = config.feature_levels
+        self.num_classes = config.num_classes + 1

-        self.stages = nn.ModuleList()
-        for i, depth in enumerate(config.depths):
-            stage = Stage(
+        self.layers = nn.ModuleList()
+        for level in config.feature_levels:
+            layer = Layer(
                 dim=config.dim,
-                src_dim=feature_channels[i],
-                depth=depth,
+                src_dim=feature_channels[level],
                 num_heads=config.num_heads,
-                window_size=config.window_size,
-                tgt_window_size=config.tgt_window_sizes[i],
-                src_window_size=config.src_window_sizes[i],
-                dropout=config.dropout,
+                self_sta_config=config.self_sta_config,
+                cross_sta_config=config.cross_sta_config[level],
             )
-            self.stages.append(stage)
-
-        self.input_norm = nn.ModuleList(nn.LayerNorm(d) for d in feature_channels)
+            self.layers.append(layer)

         # output heads
-        self.class_head = nn.Linear(config.dim, config.num_classes + 1, bias=False)
-        self.point_head = MLP(config.dim, config.dim, 2, 2)
-        self.radial_distances_head = MLP(
-            config.dim, config.dim, config.num_radial_distances, 2
+        self.class_head = nn.Linear(config.dim, self.num_classes)
+        self.point_head = nn.ModuleList(
+            MLP(config.dim, config.dim, 2, 3) for _ in config.feature_levels
+        )
+        self.radial_distances_head = nn.ModuleList(
+            MLP(config.dim, config.dim, config.num_radial_distances, 3)
+            for _ in config.feature_levels
         )

         self.init_weights()

     def init_weights(self) -> None:
+        prior_prob = 0.01
+        bias_value = -math.log((1 - prior_prob) / prior_prob)
+        self.class_head.bias.data = torch.ones(self.num_classes) * bias_value
+
         # initialize regression layers
-        nn.init.constant_(self.point_head[-1].weight, 0.0)
-        nn.init.constant_(self.point_head[-1].bias, 0.0)
+        for head in self.point_head:
+            nn.init.constant_(head[-1].weight, 0)
+            nn.init.constant_(head[-1].bias, 0)
+
+        for head in self.radial_distances_head:
+            nn.init.constant_(head[-1].weight, 0)
+            nn.init.constant_(head[-1].bias, 0)

     def forward(
         self,
@@ -539,34 +454,44 @@
     ) -> dict[str, Tensor | list[dict[str, Tensor]]]:
         src = []
         src_coords = []
-        for i, feature in enumerate(features):
+        for feature in features:
             b, _, h, w = feature.shape
             coords = torch.zeros(b, h, w, 2, dtype=torch.float32, device=feature.device)
-            src.append(self.input_norm[i](rearrange(feature, "b c h w -> b h w c")))
-            src_coords.append(
-                relative_to_absolute_pos(
-                    coords, step_x=math.ceil(width / w), step_y=math.ceil(height / h)
-                )
+            coords = relative_to_absolute_pos(
+                coords, step_x=math.ceil(width / w), step_y=math.ceil(height / h)
             )
+            # the outputs from SwinV2 are already normalized
+            src.append(rearrange(feature, "b c h w -> b h w c"))
+            src_coords.append(rearrange(coords, "b h w pos -> b (h w) pos"))
+
+        radial_distances = torch.full(
+            (*tgt.shape[:3], self.num_radial_distances),
+            math.log1p(self.query_block_size / 2),
+            dtype=torch.float32,
+            device=tgt.device,
+        )

         logits_list: list[Tensor] = []
         ref_points_list: list[Tensor] = []
         radial_distances_list: list[Tensor] = []

-        new_ref_points = ref_points.clone()  # for look forward twice
-        for i, stage in enumerate(self.stages):
-            tgt = stage(
+        # for look forward twice
+        new_ref_points = ref_points.clone()
+        new_radial_distances = radial_distances.clone()
+
+        for i, layer in enumerate(self.layers):
+            tgt = layer(
                 tgt=tgt,
-                src=src[i],
+                src=src[self.feature_levels[i]],
                 tgt_coords=relative_to_absolute_pos(
                     ref_points, self.query_block_size, self.query_block_size
-                ),
-                src_coords=src_coords[i],
+                ).flatten(1, 2),
+                src_coords=src_coords[self.feature_levels[i]],
             )

             # output heads
-            delta_point = self.point_head(tgt)
-            radial_distances = self.radial_distances_head(tgt)
+            delta_point = self.point_head[i](tgt)
+            delta_distances = self.radial_distances_head[i](tgt)
             logits = self.class_head(tgt)

             ref_points_list.append(
@@ -577,10 +502,14 @@ class LSPTransformer(nn.Module):
                 ).flatten(1, 2)
             )
             logits_list.append(logits.flatten(1, 2))
-            radial_distances_list.append(radial_distances.flatten(1, 2))
+            radial_distances_list.append(
+                torch.flatten(new_radial_distances + delta_distances, 1, 2)
+            )

             new_ref_points = ref_points + delta_point
+            new_radial_distances = radial_distances + delta_distances
             ref_points = new_ref_points.detach()
+            radial_distances = new_radial_distances.detach()

         return {
             "logits": logits_list[-1],
@@ -608,12 +537,12 @@ class LSPTransformer(nn.Module):
 class FeatureSampling(nn.Module):
     def __init__(self, in_dim: int, out_dim: int) -> None:
         super().__init__()
-        self.reduction = nn.Linear(in_dim, out_dim, bias=False)
+        self.reduction = nn.Conv2d(in_dim, out_dim, kernel_size=1, bias=False)
         self.norm = nn.LayerNorm(out_dim)

     def forward(self, points: Tensor, feature: Tensor) -> Tensor:
-        x = F.grid_sample(feature, points * 2 - 1, align_corners=False)
-        return self.norm(self.reduction(rearrange(x, "b c h w -> b h w c")))
+        x = F.grid_sample(self.reduction(feature), points * 2 - 1, align_corners=False)
+        return self.norm(rearrange(x, "b c h w -> b h w c"))


 class LSPDetrModel(PreTrainedModel):
@@ -627,7 +556,7 @@ class LSPDetrModel(PreTrainedModel):
         _, *feature_channels, neck = self.backbone.num_features

         self.feature_sampling = FeatureSampling(neck, config.dim)
-        self.decode_head = LSPTransformer(config, feature_channels[::-1])
+        self.decode_head = LSPTransformer(config, feature_channels)

     def forward(self, pixel_values: Tensor) -> dict[str, Tensor]:
         b, _, h, w = pixel_values.shape
@@ -649,4 +578,4 @@ class LSPDetrModel(PreTrainedModel):
             neck,
         )

-        return self.decode_head(tgt, ref_points, features[::-1], h, w)
+        return self.decode_head(tgt, ref_points, features, h, w)
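The core of this rewrite is that generate_sta_mask builds a FlexAttention mask_mod and create_sta_block_mask caches the resulting BlockMask per shape. For readers new to FlexAttention, here is a minimal, self-contained 1D analogue of that masking idea. It is not from this repo; it assumes a PyTorch build where eager flex_attention runs on your device (roughly 2.5+ with CUDA, newer for CPU):

import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

# 1D sliding-window analogue of generate_sta_mask: each query attends to the
# kernel-sized window of keys around its (clamped) position.
KV_LEN = Q_LEN = 64
KERNEL = 5

def sliding_mask(b, h, q_idx, kv_idx):
    # Clamping keeps edge queries' windows fully inside the canvas,
    # mirroring the clamp on center_x / center_y in generate_sta_mask.
    center = q_idx.clamp(KERNEL // 2, (KV_LEN - 1) - KERNEL // 2)
    return (center - kv_idx).abs() <= KERNEL // 2

device = "cuda" if torch.cuda.is_available() else "cpu"
block_mask = create_block_mask(
    sliding_mask, B=None, H=None, Q_LEN=Q_LEN, KV_LEN=KV_LEN, device=device
)
q, k, v = (torch.randn(1, 12, Q_LEN, 32, device=device) for _ in range(3))
out = flex_attention(q, k, v, block_mask=block_mask)
print(out.shape)  # torch.Size([1, 12, 64, 32])

The clamp is what distinguishes this scheme from plain neighborhood attention: queries near the border keep a full-width window instead of a truncated one, so every query sees the same number of keys.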