Upload model

Browse files

Files changed (4) hide show

config.json +9 -9
configuration.py +5 -5
model.safetensors +2 -2
modeling.py +18 -84

config.json CHANGED Viewed

@@ -20,17 +20,17 @@
     {
       "kernel": 5,
       "kv_tile": 8,
-      "q_tile": 3
     },
     {
       "kernel": 5,
       "kv_tile": 4,
-      "q_tile": 3
     },
     {
       "kernel": 5,
       "kv_tile": 2,
-      "q_tile": 3
     }
   ],
   "dim": 384,
@@ -43,17 +43,17 @@
     0
   ],
   "model_type": "lsp_detr",
-  "num_classes": 5,
   "num_heads": 12,
   "num_radial_distances": 64,
-  "query_block_size": 14.222222222222223,
   "self_sta_config": {
-    "kernel": 3,
-    "kv_tile": 3,
-    "q_tile": 3
   },
   "torch_dtype": "float32",
-  "transformers_version": "4.52.3",
   "use_pretrained_backbone": true,
   "use_timm_backbone": false
 }

     {
       "kernel": 5,
       "kv_tile": 8,
+      "q_tile": 4
     },
     {
       "kernel": 5,
       "kv_tile": 4,
+      "q_tile": 4
     },
     {
       "kernel": 5,
       "kv_tile": 2,
+      "q_tile": 4
     }
   ],
   "dim": 384,
     0
   ],
   "model_type": "lsp_detr",
+  "num_classes": 1,
   "num_heads": 12,
   "num_radial_distances": 64,
+  "query_block_size": 8,
   "self_sta_config": {
+    "kernel": 5,
+    "kv_tile": 4,
+    "q_tile": 4
   },
   "torch_dtype": "float32",
+  "transformers_version": "4.53.3",
   "use_pretrained_backbone": true,
   "use_timm_backbone": false
 }

configuration.py CHANGED Viewed

@@ -23,19 +23,19 @@ class LSPDetrConfig(PretrainedConfig):
         dim: int = 384,
         num_heads: int = 12,
         num_classes: int = 1,
-        query_block_size: float = 14.222222222222223,  # 256 / 18
         feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
         num_radial_distances: int = 64,
         self_sta_config: STAConfig | None = None,
         cross_sta_config: tuple[STAConfig, ...] = (
-            {"kernel": 5, "q_tile": 3, "kv_tile": 8},
-            {"kernel": 5, "q_tile": 3, "kv_tile": 4},
-            {"kernel": 5, "q_tile": 3, "kv_tile": 2},
         ),
         **kwargs,
     ) -> None:
         if self_sta_config is None:
-            self_sta_config = {"kernel": 3, "q_tile": 3, "kv_tile": 3}
         if backbone_kwargs is None:
             backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}

         dim: int = 384,
         num_heads: int = 12,
         num_classes: int = 1,
+        query_block_size: float = 8,  # 256 / 32
         feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
         num_radial_distances: int = 64,
         self_sta_config: STAConfig | None = None,
         cross_sta_config: tuple[STAConfig, ...] = (
+            {"kernel": 5, "q_tile": 4, "kv_tile": 8},
+            {"kernel": 5, "q_tile": 4, "kv_tile": 4},
+            {"kernel": 5, "q_tile": 4, "kv_tile": 2},
         ),
         **kwargs,
     ) -> None:
         if self_sta_config is None:
+            self_sta_config = {"kernel": 5, "q_tile": 4, "kv_tile": 4}
         if backbone_kwargs is None:
             backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f5437eb889a864ff88ae121ed7581217778b30430657ce39751a7f5e4b96082
-size 180151024

 version https://git-lfs.github.com/spec/v1
+oid sha256:99b0d385faba4ecb55f9586f57db9c454d41716f5f86614e6110b7f18ec4aca6
+size 180178896

modeling.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import math
-from functools import cached_property, lru_cache
 import torch
 import torch.nn.functional as F
-from einops import rearrange
 from torch import Tensor, nn
 from torch.nn.attention.flex_attention import (
     BlockMask,
@@ -11,7 +11,7 @@ from torch.nn.attention.flex_attention import (
     create_block_mask,
     flex_attention,
 )
-from torch.nn.utils import parametrize
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils.backbone_utils import load_backbone
@@ -21,39 +21,6 @@ from .configuration import LSPDetrConfig, STAConfig
 flex_attention = torch.compile(flex_attention, dynamic=True)
-def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
-    """Taken from https://github.com/naver-ai/rope-vit/blob/main/self-attn/rope_self_attn.py."""
-    freqs_x = []
-    freqs_y = []
-    freqs = 1 / (theta ** (torch.arange(0, head_dim, 2 * pos_dim).float() / head_dim))
-    for _ in range(num_heads):
-        angles = torch.rand(1) * 2 * torch.pi
-        fx = torch.cat(
-            [freqs * torch.cos(angles), freqs * torch.cos(torch.pi / 2 + angles)],
-            dim=-1,
-        )
-        fy = torch.cat(
-            [freqs * torch.sin(angles), freqs * torch.sin(torch.pi / 2 + angles)],
-            dim=-1,
-        )
-        freqs_x.append(fx)
-        freqs_y.append(fy)
-    freqs_x = torch.stack(freqs_x, dim=0)
-    freqs_y = torch.stack(freqs_y, dim=0)
-    return torch.stack([freqs_x, freqs_y], dim=0)
-class Skew(nn.Module):
-    """Skew-symmetric matrix parameterization."""
-    def forward(self, x: Tensor) -> Tensor:
-        a = x.triu(1)
-        return a - a.transpose(-1, -2)
-    def right_inverse(self, x: Tensor) -> Tensor:
-        return x.triu(1)
 class CayleySTRING(nn.Module):
     """Implements the Cayley-STRING positional encoding.
@@ -61,42 +28,20 @@ class CayleySTRING(nn.Module):
     (https://arxiv.org/abs/2502.02562).
     Applies RoPE followed by multiplication with a learnable orthogonal matrix P
-    parameterized by the Cayley transform: P = (I - S)(I + S)^-1, where S is
-    a learnable skew-symmetric matrix.
     Args:
-        dim (int): The feature dimension of the input tensor. Must be even.
-        max_seq_len (int): The maximum sequence length.
-        base (int): The base value for the RoPE frequency calculation. Defaults to 10000.
-        pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D). Defaults to 1.
     """
-    def __init__(
-        self, dim: int, num_heads: int, pos_dim: int = 2, theta: float = 100.0
-    ) -> None:
         super().__init__()
-        assert dim % num_heads == 0, "Dimension must be divisible by num_heads."
-        head_dim = dim // num_heads
-        self.freqs = nn.Parameter(init_freqs(head_dim, num_heads, pos_dim, theta))
-        self.S = nn.Parameter(torch.zeros(head_dim, head_dim))
-        parametrize.register_parametrization(self, "S", Skew())
-        self.register_buffer("I", torch.eye(head_dim), persistent=False)
-        self.init_weights()
-    def init_weights(self) -> None:
-        self.S = nn.init.kaiming_uniform_(self.S, a=math.sqrt(5))
-    @cached_property
-    def P(self) -> Tensor:
-        i_plus_s_inv = torch.linalg.inv(self.I + self.S)
-        return torch.matmul(self.I - self.S, i_plus_s_inv)
-    @parametrize.cached()
     @torch.autocast("cuda", enabled=False)
     def forward(self, x: Tensor, positions: Tensor) -> Tensor:
         """Apply Cayley-STRING positional encoding.
@@ -105,23 +50,13 @@ class CayleySTRING(nn.Module):
             x ([b, h, n, d]): Input tensor.
             positions ([b, n, pos_dim]): Positions tensor.
         """
-        # Compute (I + S)^-1 @ x
-        if self.training:
-            # Use linalg.solve during training for numerical stability.
-            y = torch.linalg.solve(
-                self.I + self.S, rearrange(x.float(), "b h n d -> (b h) d n")
-            )
-            px = torch.matmul(self.I - self.S, y)
-            px = rearrange(px, "(b h) d n -> b h n d", b=x.size(0))
-        else:
-            # During inference, use the pre-calculated matrix P for performance.
-            px = x.float() @ self.P.T
-        px = px.contiguous()
         # apply RoPE-Mixed
-        angles = torch.einsum("bnk,khc->bhnc", positions, self.freqs)
-        freqs_cis = torch.polar(torch.ones_like(angles), angles)
         px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
         out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
@@ -276,7 +211,7 @@ class STAttention(nn.Module):
         self.q_tile = q_tile
         self.kv_tile = kv_tile
-        self.pe = CayleySTRING(dim, num_heads)
         self.q = nn.Linear(dim, dim, bias=False)
         self.kv = nn.Linear(src_dim, dim * 2, bias=False)
         self.wo = nn.Linear(dim, dim, bias=False)
@@ -433,7 +368,7 @@ class LSPTransformer(nn.Module):
     def init_weights(self) -> None:
         prior_prob = 0.01
         bias_value = -math.log((1 - prior_prob) / prior_prob)
-        nn.init.constant_(self.class_head.bias, bias_value)
         # initialize regression layers
         for head in self.point_head:
@@ -518,7 +453,6 @@ class LSPTransformer(nn.Module):
             "absolute_points": relative_to_absolute_pos(
                 ref_points, self.query_block_size, self.query_block_size
             ).flatten(1, 2),
-            "embeddings": tgt.flatten(1, 2),
             "aux_outputs": [
                 {
                     "logits": a,

 import math
+from functools import lru_cache
 import torch
 import torch.nn.functional as F
+from einops import rearrange, repeat
 from torch import Tensor, nn
 from torch.nn.attention.flex_attention import (
     BlockMask,
     create_block_mask,
     flex_attention,
 )
+from torch.nn.utils.parametrizations import orthogonal
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils.backbone_utils import load_backbone
 flex_attention = torch.compile(flex_attention, dynamic=True)
 class CayleySTRING(nn.Module):
     """Implements the Cayley-STRING positional encoding.
     (https://arxiv.org/abs/2502.02562).
     Applies RoPE followed by multiplication with a learnable orthogonal matrix P
+    parameterized by the Cayley transform (P is applied to the input first, then RoPE).
     Args:
+        dim (int): The per-head feature dimension of the input tensor. Must be even.
+        pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D).
+        theta (float): The base value for the RoPE frequency calculation.
     """
+    def __init__(self, dim: int, pos_dim: int = 2, theta: float = 100.0) -> None:
         super().__init__()
+        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+        self.freqs = nn.Parameter(repeat(freqs, "d -> p d", p=pos_dim).clone())
+        self.P = orthogonal(nn.Linear(dim, dim, bias=False), orthogonal_map="cayley")
     @torch.autocast("cuda", enabled=False)
     def forward(self, x: Tensor, positions: Tensor) -> Tensor:
         """Apply Cayley-STRING positional encoding.
             x ([b, h, n, d]): Input tensor.
             positions ([b, n, pos_dim]): Positions tensor.
         """
+        px = self.P(x.float())
         # apply RoPE-Mixed
+        freqs = positions @ self.freqs
+        freqs_cis = rearrange(
+            torch.polar(torch.ones_like(freqs), freqs), "b n c -> b 1 n c"
+        )
         px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
         out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
         self.q_tile = q_tile
         self.kv_tile = kv_tile
+        self.pe = CayleySTRING(dim // num_heads)
         self.q = nn.Linear(dim, dim, bias=False)
         self.kv = nn.Linear(src_dim, dim * 2, bias=False)
         self.wo = nn.Linear(dim, dim, bias=False)
     def init_weights(self) -> None:
         prior_prob = 0.01
         bias_value = -math.log((1 - prior_prob) / prior_prob)
+        self.class_head.bias.data = torch.ones(self.num_classes) * bias_value
         # initialize regression layers
         for head in self.point_head:
             "absolute_points": relative_to_absolute_pos(
                 ref_points, self.query_block_size, self.query_block_size
             ).flatten(1, 2),
             "aux_outputs": [
                 {
                     "logits": a,