matejpekar committed on
Commit
45f73f8
·
verified ·
1 Parent(s): a321761

Upload model

Browse files
Files changed (4) hide show
  1. config.json +9 -9
  2. configuration.py +5 -5
  3. model.safetensors +2 -2
  4. modeling.py +18 -84
config.json CHANGED
@@ -20,17 +20,17 @@
20
  {
21
  "kernel": 5,
22
  "kv_tile": 8,
23
- "q_tile": 3
24
  },
25
  {
26
  "kernel": 5,
27
  "kv_tile": 4,
28
- "q_tile": 3
29
  },
30
  {
31
  "kernel": 5,
32
  "kv_tile": 2,
33
- "q_tile": 3
34
  }
35
  ],
36
  "dim": 384,
@@ -43,17 +43,17 @@
43
  0
44
  ],
45
  "model_type": "lsp_detr",
46
- "num_classes": 5,
47
  "num_heads": 12,
48
  "num_radial_distances": 64,
49
- "query_block_size": 14.222222222222223,
50
  "self_sta_config": {
51
- "kernel": 3,
52
- "kv_tile": 3,
53
- "q_tile": 3
54
  },
55
  "torch_dtype": "float32",
56
- "transformers_version": "4.52.3",
57
  "use_pretrained_backbone": true,
58
  "use_timm_backbone": false
59
  }
 
20
  {
21
  "kernel": 5,
22
  "kv_tile": 8,
23
+ "q_tile": 4
24
  },
25
  {
26
  "kernel": 5,
27
  "kv_tile": 4,
28
+ "q_tile": 4
29
  },
30
  {
31
  "kernel": 5,
32
  "kv_tile": 2,
33
+ "q_tile": 4
34
  }
35
  ],
36
  "dim": 384,
 
43
  0
44
  ],
45
  "model_type": "lsp_detr",
46
+ "num_classes": 1,
47
  "num_heads": 12,
48
  "num_radial_distances": 64,
49
+ "query_block_size": 8,
50
  "self_sta_config": {
51
+ "kernel": 5,
52
+ "kv_tile": 4,
53
+ "q_tile": 4
54
  },
55
  "torch_dtype": "float32",
56
+ "transformers_version": "4.53.3",
57
  "use_pretrained_backbone": true,
58
  "use_timm_backbone": false
59
  }
configuration.py CHANGED
@@ -23,19 +23,19 @@ class LSPDetrConfig(PretrainedConfig):
23
  dim: int = 384,
24
  num_heads: int = 12,
25
  num_classes: int = 1,
26
- query_block_size: float = 14.222222222222223, # 256 / 18
27
  feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
28
  num_radial_distances: int = 64,
29
  self_sta_config: STAConfig | None = None,
30
  cross_sta_config: tuple[STAConfig, ...] = (
31
- {"kernel": 5, "q_tile": 3, "kv_tile": 8},
32
- {"kernel": 5, "q_tile": 3, "kv_tile": 4},
33
- {"kernel": 5, "q_tile": 3, "kv_tile": 2},
34
  ),
35
  **kwargs,
36
  ) -> None:
37
  if self_sta_config is None:
38
- self_sta_config = {"kernel": 3, "q_tile": 3, "kv_tile": 3}
39
 
40
  if backbone_kwargs is None:
41
  backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}
 
23
  dim: int = 384,
24
  num_heads: int = 12,
25
  num_classes: int = 1,
26
+ query_block_size: float = 8, # 256 / 32
27
  feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
28
  num_radial_distances: int = 64,
29
  self_sta_config: STAConfig | None = None,
30
  cross_sta_config: tuple[STAConfig, ...] = (
31
+ {"kernel": 5, "q_tile": 4, "kv_tile": 8},
32
+ {"kernel": 5, "q_tile": 4, "kv_tile": 4},
33
+ {"kernel": 5, "q_tile": 4, "kv_tile": 2},
34
  ),
35
  **kwargs,
36
  ) -> None:
37
  if self_sta_config is None:
38
+ self_sta_config = {"kernel": 5, "q_tile": 4, "kv_tile": 4}
39
 
40
  if backbone_kwargs is None:
41
  backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f5437eb889a864ff88ae121ed7581217778b30430657ce39751a7f5e4b96082
3
- size 180151024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99b0d385faba4ecb55f9586f57db9c454d41716f5f86614e6110b7f18ec4aca6
3
+ size 180178896
modeling.py CHANGED
@@ -1,9 +1,9 @@
1
  import math
2
- from functools import cached_property, lru_cache
3
 
4
  import torch
5
  import torch.nn.functional as F
6
- from einops import rearrange
7
  from torch import Tensor, nn
8
  from torch.nn.attention.flex_attention import (
9
  BlockMask,
@@ -11,7 +11,7 @@ from torch.nn.attention.flex_attention import (
11
  create_block_mask,
12
  flex_attention,
13
  )
14
- from torch.nn.utils import parametrize
15
  from transformers.modeling_utils import PreTrainedModel
16
  from transformers.utils.backbone_utils import load_backbone
17
 
@@ -21,39 +21,6 @@ from .configuration import LSPDetrConfig, STAConfig
21
  flex_attention = torch.compile(flex_attention, dynamic=True)
22
 
23
 
24
- def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
25
- """Taken from https://github.com/naver-ai/rope-vit/blob/main/self-attn/rope_self_attn.py."""
26
- freqs_x = []
27
- freqs_y = []
28
- freqs = 1 / (theta ** (torch.arange(0, head_dim, 2 * pos_dim).float() / head_dim))
29
- for _ in range(num_heads):
30
- angles = torch.rand(1) * 2 * torch.pi
31
- fx = torch.cat(
32
- [freqs * torch.cos(angles), freqs * torch.cos(torch.pi / 2 + angles)],
33
- dim=-1,
34
- )
35
- fy = torch.cat(
36
- [freqs * torch.sin(angles), freqs * torch.sin(torch.pi / 2 + angles)],
37
- dim=-1,
38
- )
39
- freqs_x.append(fx)
40
- freqs_y.append(fy)
41
- freqs_x = torch.stack(freqs_x, dim=0)
42
- freqs_y = torch.stack(freqs_y, dim=0)
43
- return torch.stack([freqs_x, freqs_y], dim=0)
44
-
45
-
46
- class Skew(nn.Module):
47
- """Skew-symmetric matrix parameterization."""
48
-
49
- def forward(self, x: Tensor) -> Tensor:
50
- a = x.triu(1)
51
- return a - a.transpose(-1, -2)
52
-
53
- def right_inverse(self, x: Tensor) -> Tensor:
54
- return x.triu(1)
55
-
56
-
57
  class CayleySTRING(nn.Module):
58
  """Implements the Cayley-STRING positional encoding.
59
 
@@ -61,42 +28,20 @@ class CayleySTRING(nn.Module):
61
  (https://arxiv.org/abs/2502.02562).
62
 
63
  Applies RoPE followed by multiplication with a learnable orthogonal matrix P
64
- parameterized by the Cayley transform: P = (I - S)(I + S)^-1, where S is
65
- a learnable skew-symmetric matrix.
66
 
67
  Args:
68
- dim (int): The feature dimension of the input tensor. Must be even.
69
- max_seq_len (int): The maximum sequence length.
70
- base (int): The base value for the RoPE frequency calculation. Defaults to 10000.
71
- pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D). Defaults to 1.
72
  """
73
 
74
- def __init__(
75
- self, dim: int, num_heads: int, pos_dim: int = 2, theta: float = 100.0
76
- ) -> None:
77
  super().__init__()
78
- assert dim % num_heads == 0, "Dimension must be divisible by num_heads."
79
-
80
- head_dim = dim // num_heads
81
 
82
- self.freqs = nn.Parameter(init_freqs(head_dim, num_heads, pos_dim, theta))
83
-
84
- self.S = nn.Parameter(torch.zeros(head_dim, head_dim))
85
- parametrize.register_parametrization(self, "S", Skew())
86
-
87
- self.register_buffer("I", torch.eye(head_dim), persistent=False)
88
-
89
- self.init_weights()
90
-
91
- def init_weights(self) -> None:
92
- self.S = nn.init.kaiming_uniform_(self.S, a=math.sqrt(5))
93
-
94
- @cached_property
95
- def P(self) -> Tensor:
96
- i_plus_s_inv = torch.linalg.inv(self.I + self.S)
97
- return torch.matmul(self.I - self.S, i_plus_s_inv)
98
-
99
- @parametrize.cached()
100
  @torch.autocast("cuda", enabled=False)
101
  def forward(self, x: Tensor, positions: Tensor) -> Tensor:
102
  """Apply Cayley-STRING positional encoding.
@@ -105,23 +50,13 @@ class CayleySTRING(nn.Module):
105
  x ([b, h, n, d]): Input tensor.
106
  positions ([b, n, pos_dim]): Positions tensor.
107
  """
108
- # Compute (I + S)^-1 @ x
109
- if self.training:
110
- # Use linalg.solve during training for numerical stability.
111
- y = torch.linalg.solve(
112
- self.I + self.S, rearrange(x.float(), "b h n d -> (b h) d n")
113
- )
114
- px = torch.matmul(self.I - self.S, y)
115
- px = rearrange(px, "(b h) d n -> b h n d", b=x.size(0))
116
- else:
117
- # During inference, use the pre-calculated matrix P for performance.
118
- px = x.float() @ self.P.T
119
-
120
- px = px.contiguous()
121
 
122
  # apply RoPE-Mixed
123
- angles = torch.einsum("bnk,khc->bhnc", positions, self.freqs)
124
- freqs_cis = torch.polar(torch.ones_like(angles), angles)
 
 
125
  px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
126
  out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
127
 
@@ -276,7 +211,7 @@ class STAttention(nn.Module):
276
  self.q_tile = q_tile
277
  self.kv_tile = kv_tile
278
 
279
- self.pe = CayleySTRING(dim, num_heads)
280
  self.q = nn.Linear(dim, dim, bias=False)
281
  self.kv = nn.Linear(src_dim, dim * 2, bias=False)
282
  self.wo = nn.Linear(dim, dim, bias=False)
@@ -433,7 +368,7 @@ class LSPTransformer(nn.Module):
433
  def init_weights(self) -> None:
434
  prior_prob = 0.01
435
  bias_value = -math.log((1 - prior_prob) / prior_prob)
436
- nn.init.constant_(self.class_head.bias, bias_value)
437
 
438
  # initialize regression layers
439
  for head in self.point_head:
@@ -518,7 +453,6 @@ class LSPTransformer(nn.Module):
518
  "absolute_points": relative_to_absolute_pos(
519
  ref_points, self.query_block_size, self.query_block_size
520
  ).flatten(1, 2),
521
- "embeddings": tgt.flatten(1, 2),
522
  "aux_outputs": [
523
  {
524
  "logits": a,
 
1
  import math
2
+ from functools import lru_cache
3
 
4
  import torch
5
  import torch.nn.functional as F
6
+ from einops import rearrange, repeat
7
  from torch import Tensor, nn
8
  from torch.nn.attention.flex_attention import (
9
  BlockMask,
 
11
  create_block_mask,
12
  flex_attention,
13
  )
14
+ from torch.nn.utils.parametrizations import orthogonal
15
  from transformers.modeling_utils import PreTrainedModel
16
  from transformers.utils.backbone_utils import load_backbone
17
 
 
21
  flex_attention = torch.compile(flex_attention, dynamic=True)
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  class CayleySTRING(nn.Module):
25
  """Implements the Cayley-STRING positional encoding.
26
 
 
28
  (https://arxiv.org/abs/2502.02562).
29
 
30
  Applies RoPE followed by multiplication with a learnable orthogonal matrix P
31
+ parameterized by the Cayley transform.
 
32
 
33
  Args:
34
+ dim (int): The feature dimension of the input tensor. Must be even.
35
+ pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D).
36
+ theta (float): The base value for the RoPE frequency calculation.
 
37
  """
38
 
39
+ def __init__(self, dim: int, pos_dim: int = 2, theta: float = 100.0) -> None:
 
 
40
  super().__init__()
41
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
42
+ self.freqs = nn.Parameter(repeat(freqs, "d -> p d", p=pos_dim).clone())
43
+ self.P = orthogonal(nn.Linear(dim, dim, bias=False), orthogonal_map="cayley")
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  @torch.autocast("cuda", enabled=False)
46
  def forward(self, x: Tensor, positions: Tensor) -> Tensor:
47
  """Apply Cayley-STRING positional encoding.
 
50
  x ([b, h, n, d]): Input tensor.
51
  positions ([b, n, pos_dim]): Positions tensor.
52
  """
53
+ px = self.P(x.float())
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  # apply RoPE-Mixed
56
+ freqs = positions @ self.freqs
57
+ freqs_cis = rearrange(
58
+ torch.polar(torch.ones_like(freqs), freqs), "b n c -> b 1 n c"
59
+ )
60
  px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
61
  out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
62
 
 
211
  self.q_tile = q_tile
212
  self.kv_tile = kv_tile
213
 
214
+ self.pe = CayleySTRING(dim // num_heads)
215
  self.q = nn.Linear(dim, dim, bias=False)
216
  self.kv = nn.Linear(src_dim, dim * 2, bias=False)
217
  self.wo = nn.Linear(dim, dim, bias=False)
 
368
  def init_weights(self) -> None:
369
  prior_prob = 0.01
370
  bias_value = -math.log((1 - prior_prob) / prior_prob)
371
+ self.class_head.bias.data = torch.ones(self.num_classes) * bias_value
372
 
373
  # initialize regression layers
374
  for head in self.point_head:
 
453
  "absolute_points": relative_to_absolute_pos(
454
  ref_points, self.query_block_size, self.query_block_size
455
  ).flatten(1, 2),
 
456
  "aux_outputs": [
457
  {
458
  "logits": a,