Upload model
Browse files
- config.json +9 -9
- configuration.py +5 -5
- model.safetensors +2 -2
- modeling.py +18 -84
config.json
CHANGED
|
@@ -20,17 +20,17 @@
|
|
| 20 |
{
|
| 21 |
"kernel": 5,
|
| 22 |
"kv_tile": 8,
|
| 23 |
-
"q_tile":
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"kernel": 5,
|
| 27 |
"kv_tile": 4,
|
| 28 |
-
"q_tile":
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"kernel": 5,
|
| 32 |
"kv_tile": 2,
|
| 33 |
-
"q_tile":
|
| 34 |
}
|
| 35 |
],
|
| 36 |
"dim": 384,
|
|
@@ -43,17 +43,17 @@
|
|
| 43 |
0
|
| 44 |
],
|
| 45 |
"model_type": "lsp_detr",
|
| 46 |
-
"num_classes":
|
| 47 |
"num_heads": 12,
|
| 48 |
"num_radial_distances": 64,
|
| 49 |
-
"query_block_size":
|
| 50 |
"self_sta_config": {
|
| 51 |
-
"kernel":
|
| 52 |
-
"kv_tile":
|
| 53 |
-
"q_tile":
|
| 54 |
},
|
| 55 |
"torch_dtype": "float32",
|
| 56 |
-
"transformers_version": "4.
|
| 57 |
"use_pretrained_backbone": true,
|
| 58 |
"use_timm_backbone": false
|
| 59 |
}
|
|
|
|
| 20 |
{
|
| 21 |
"kernel": 5,
|
| 22 |
"kv_tile": 8,
|
| 23 |
+
"q_tile": 4
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"kernel": 5,
|
| 27 |
"kv_tile": 4,
|
| 28 |
+
"q_tile": 4
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"kernel": 5,
|
| 32 |
"kv_tile": 2,
|
| 33 |
+
"q_tile": 4
|
| 34 |
}
|
| 35 |
],
|
| 36 |
"dim": 384,
|
|
|
|
| 43 |
0
|
| 44 |
],
|
| 45 |
"model_type": "lsp_detr",
|
| 46 |
+
"num_classes": 1,
|
| 47 |
"num_heads": 12,
|
| 48 |
"num_radial_distances": 64,
|
| 49 |
+
"query_block_size": 8,
|
| 50 |
"self_sta_config": {
|
| 51 |
+
"kernel": 5,
|
| 52 |
+
"kv_tile": 4,
|
| 53 |
+
"q_tile": 4
|
| 54 |
},
|
| 55 |
"torch_dtype": "float32",
|
| 56 |
+
"transformers_version": "4.53.3",
|
| 57 |
"use_pretrained_backbone": true,
|
| 58 |
"use_timm_backbone": false
|
| 59 |
}
|
configuration.py
CHANGED
|
@@ -23,19 +23,19 @@ class LSPDetrConfig(PretrainedConfig):
|
|
| 23 |
dim: int = 384,
|
| 24 |
num_heads: int = 12,
|
| 25 |
num_classes: int = 1,
|
| 26 |
-
query_block_size: float =
|
| 27 |
feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
|
| 28 |
num_radial_distances: int = 64,
|
| 29 |
self_sta_config: STAConfig | None = None,
|
| 30 |
cross_sta_config: tuple[STAConfig, ...] = (
|
| 31 |
-
{"kernel": 5, "q_tile":
|
| 32 |
-
{"kernel": 5, "q_tile":
|
| 33 |
-
{"kernel": 5, "q_tile":
|
| 34 |
),
|
| 35 |
**kwargs,
|
| 36 |
) -> None:
|
| 37 |
if self_sta_config is None:
|
| 38 |
-
self_sta_config = {"kernel":
|
| 39 |
|
| 40 |
if backbone_kwargs is None:
|
| 41 |
backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}
|
|
|
|
| 23 |
dim: int = 384,
|
| 24 |
num_heads: int = 12,
|
| 25 |
num_classes: int = 1,
|
| 26 |
+
query_block_size: float = 8, # 256 / 32
|
| 27 |
feature_levels: tuple[int, ...] = (2, 1, 0, 2, 1, 0),
|
| 28 |
num_radial_distances: int = 64,
|
| 29 |
self_sta_config: STAConfig | None = None,
|
| 30 |
cross_sta_config: tuple[STAConfig, ...] = (
|
| 31 |
+
{"kernel": 5, "q_tile": 4, "kv_tile": 8},
|
| 32 |
+
{"kernel": 5, "q_tile": 4, "kv_tile": 4},
|
| 33 |
+
{"kernel": 5, "q_tile": 4, "kv_tile": 2},
|
| 34 |
),
|
| 35 |
**kwargs,
|
| 36 |
) -> None:
|
| 37 |
if self_sta_config is None:
|
| 38 |
+
self_sta_config = {"kernel": 5, "q_tile": 4, "kv_tile": 4}
|
| 39 |
|
| 40 |
if backbone_kwargs is None:
|
| 41 |
backbone_kwargs = {"out_features": ["stage1", "stage2", "stage3", "stage4"]}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99b0d385faba4ecb55f9586f57db9c454d41716f5f86614e6110b7f18ec4aca6
|
| 3 |
+
size 180178896
|
modeling.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import math
|
| 2 |
-
from functools import
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import torch.nn.functional as F
|
| 6 |
-
from einops import rearrange
|
| 7 |
from torch import Tensor, nn
|
| 8 |
from torch.nn.attention.flex_attention import (
|
| 9 |
BlockMask,
|
|
@@ -11,7 +11,7 @@ from torch.nn.attention.flex_attention import (
|
|
| 11 |
create_block_mask,
|
| 12 |
flex_attention,
|
| 13 |
)
|
| 14 |
-
from torch.nn.utils import
|
| 15 |
from transformers.modeling_utils import PreTrainedModel
|
| 16 |
from transformers.utils.backbone_utils import load_backbone
|
| 17 |
|
|
@@ -21,39 +21,6 @@ from .configuration import LSPDetrConfig, STAConfig
|
|
| 21 |
flex_attention = torch.compile(flex_attention, dynamic=True)
|
| 22 |
|
| 23 |
|
| 24 |
-
def init_freqs(head_dim: int, num_heads: int, pos_dim: int, theta: float) -> Tensor:
|
| 25 |
-
"""Taken from https://github.com/naver-ai/rope-vit/blob/main/self-attn/rope_self_attn.py."""
|
| 26 |
-
freqs_x = []
|
| 27 |
-
freqs_y = []
|
| 28 |
-
freqs = 1 / (theta ** (torch.arange(0, head_dim, 2 * pos_dim).float() / head_dim))
|
| 29 |
-
for _ in range(num_heads):
|
| 30 |
-
angles = torch.rand(1) * 2 * torch.pi
|
| 31 |
-
fx = torch.cat(
|
| 32 |
-
[freqs * torch.cos(angles), freqs * torch.cos(torch.pi / 2 + angles)],
|
| 33 |
-
dim=-1,
|
| 34 |
-
)
|
| 35 |
-
fy = torch.cat(
|
| 36 |
-
[freqs * torch.sin(angles), freqs * torch.sin(torch.pi / 2 + angles)],
|
| 37 |
-
dim=-1,
|
| 38 |
-
)
|
| 39 |
-
freqs_x.append(fx)
|
| 40 |
-
freqs_y.append(fy)
|
| 41 |
-
freqs_x = torch.stack(freqs_x, dim=0)
|
| 42 |
-
freqs_y = torch.stack(freqs_y, dim=0)
|
| 43 |
-
return torch.stack([freqs_x, freqs_y], dim=0)
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
class Skew(nn.Module):
|
| 47 |
-
"""Skew-symmetric matrix parameterization."""
|
| 48 |
-
|
| 49 |
-
def forward(self, x: Tensor) -> Tensor:
|
| 50 |
-
a = x.triu(1)
|
| 51 |
-
return a - a.transpose(-1, -2)
|
| 52 |
-
|
| 53 |
-
def right_inverse(self, x: Tensor) -> Tensor:
|
| 54 |
-
return x.triu(1)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
class CayleySTRING(nn.Module):
|
| 58 |
"""Implements the Cayley-STRING positional encoding.
|
| 59 |
|
|
@@ -61,42 +28,20 @@ class CayleySTRING(nn.Module):
|
|
| 61 |
(https://arxiv.org/abs/2502.02562).
|
| 62 |
|
| 63 |
Applies a learnable orthogonal matrix P followed by RoPE, where P is
|
| 64 |
-
parameterized by the Cayley transform
|
| 65 |
-
a learnable skew-symmetric matrix.
|
| 66 |
|
| 67 |
Args:
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D). Defaults to 1.
|
| 72 |
"""
|
| 73 |
|
| 74 |
-
def __init__(
|
| 75 |
-
self, dim: int, num_heads: int, pos_dim: int = 2, theta: float = 100.0
|
| 76 |
-
) -> None:
|
| 77 |
super().__init__()
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
|
| 82 |
-
self.freqs = nn.Parameter(init_freqs(head_dim, num_heads, pos_dim, theta))
|
| 83 |
-
|
| 84 |
-
self.S = nn.Parameter(torch.zeros(head_dim, head_dim))
|
| 85 |
-
parametrize.register_parametrization(self, "S", Skew())
|
| 86 |
-
|
| 87 |
-
self.register_buffer("I", torch.eye(head_dim), persistent=False)
|
| 88 |
-
|
| 89 |
-
self.init_weights()
|
| 90 |
-
|
| 91 |
-
def init_weights(self) -> None:
|
| 92 |
-
self.S = nn.init.kaiming_uniform_(self.S, a=math.sqrt(5))
|
| 93 |
-
|
| 94 |
-
@cached_property
|
| 95 |
-
def P(self) -> Tensor:
|
| 96 |
-
i_plus_s_inv = torch.linalg.inv(self.I + self.S)
|
| 97 |
-
return torch.matmul(self.I - self.S, i_plus_s_inv)
|
| 98 |
-
|
| 99 |
-
@parametrize.cached()
|
| 100 |
@torch.autocast("cuda", enabled=False)
|
| 101 |
def forward(self, x: Tensor, positions: Tensor) -> Tensor:
|
| 102 |
"""Apply Cayley-STRING positional encoding.
|
|
@@ -105,23 +50,13 @@ class CayleySTRING(nn.Module):
|
|
| 105 |
x ([b, h, n, d]): Input tensor.
|
| 106 |
positions ([b, n, pos_dim]): Positions tensor.
|
| 107 |
"""
|
| 108 |
-
|
| 109 |
-
if self.training:
|
| 110 |
-
# Use linalg.solve during training for numerical stability.
|
| 111 |
-
y = torch.linalg.solve(
|
| 112 |
-
self.I + self.S, rearrange(x.float(), "b h n d -> (b h) d n")
|
| 113 |
-
)
|
| 114 |
-
px = torch.matmul(self.I - self.S, y)
|
| 115 |
-
px = rearrange(px, "(b h) d n -> b h n d", b=x.size(0))
|
| 116 |
-
else:
|
| 117 |
-
# During inference, use the pre-calculated matrix P for performance.
|
| 118 |
-
px = x.float() @ self.P.T
|
| 119 |
-
|
| 120 |
-
px = px.contiguous()
|
| 121 |
|
| 122 |
# apply RoPE-Mixed
|
| 123 |
-
|
| 124 |
-
freqs_cis =
|
|
|
|
|
|
|
| 125 |
px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
|
| 126 |
out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
|
| 127 |
|
|
@@ -276,7 +211,7 @@ class STAttention(nn.Module):
|
|
| 276 |
self.q_tile = q_tile
|
| 277 |
self.kv_tile = kv_tile
|
| 278 |
|
| 279 |
-
self.pe = CayleySTRING(dim
|
| 280 |
self.q = nn.Linear(dim, dim, bias=False)
|
| 281 |
self.kv = nn.Linear(src_dim, dim * 2, bias=False)
|
| 282 |
self.wo = nn.Linear(dim, dim, bias=False)
|
|
@@ -433,7 +368,7 @@ class LSPTransformer(nn.Module):
|
|
| 433 |
def init_weights(self) -> None:
|
| 434 |
prior_prob = 0.01
|
| 435 |
bias_value = -math.log((1 - prior_prob) / prior_prob)
|
| 436 |
-
|
| 437 |
|
| 438 |
# initialize regression layers
|
| 439 |
for head in self.point_head:
|
|
@@ -518,7 +453,6 @@ class LSPTransformer(nn.Module):
|
|
| 518 |
"absolute_points": relative_to_absolute_pos(
|
| 519 |
ref_points, self.query_block_size, self.query_block_size
|
| 520 |
).flatten(1, 2),
|
| 521 |
-
"embeddings": tgt.flatten(1, 2),
|
| 522 |
"aux_outputs": [
|
| 523 |
{
|
| 524 |
"logits": a,
|
|
|
|
| 1 |
import math
|
| 2 |
+
from functools import lru_cache
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import torch.nn.functional as F
|
| 6 |
+
from einops import rearrange, repeat
|
| 7 |
from torch import Tensor, nn
|
| 8 |
from torch.nn.attention.flex_attention import (
|
| 9 |
BlockMask,
|
|
|
|
| 11 |
create_block_mask,
|
| 12 |
flex_attention,
|
| 13 |
)
|
| 14 |
+
from torch.nn.utils.parametrizations import orthogonal
|
| 15 |
from transformers.modeling_utils import PreTrainedModel
|
| 16 |
from transformers.utils.backbone_utils import load_backbone
|
| 17 |
|
|
|
|
| 21 |
flex_attention = torch.compile(flex_attention, dynamic=True)
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
class CayleySTRING(nn.Module):
|
| 25 |
"""Implements the Cayley-STRING positional encoding.
|
| 26 |
|
|
|
|
| 28 |
(https://arxiv.org/abs/2502.02562).
|
| 29 |
|
| 30 |
Applies a learnable orthogonal matrix P followed by RoPE, where P is
|
| 31 |
+
parameterized by the Cayley transform.
|
|
|
|
| 32 |
|
| 33 |
Args:
|
| 34 |
+
dim (int): The per-head feature dimension of the input tensor. Must be even.
|
| 35 |
+
pos_dim (int): The dimensionality of the position vectors (e.g., 1 for 1D, 2 for 2D).
|
| 36 |
+
theta (float): The base value for the RoPE frequency calculation.
|
|
|
|
| 37 |
"""
|
| 38 |
|
| 39 |
+
def __init__(self, dim: int, pos_dim: int = 2, theta: float = 100.0) -> None:
|
|
|
|
|
|
|
| 40 |
super().__init__()
|
| 41 |
+
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
|
| 42 |
+
self.freqs = nn.Parameter(repeat(freqs, "d -> p d", p=pos_dim).clone())
|
| 43 |
+
self.P = orthogonal(nn.Linear(dim, dim, bias=False), orthogonal_map="cayley")
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
@torch.autocast("cuda", enabled=False)
|
| 46 |
def forward(self, x: Tensor, positions: Tensor) -> Tensor:
|
| 47 |
"""Apply Cayley-STRING positional encoding.
|
|
|
|
| 50 |
x ([b, h, n, d]): Input tensor.
|
| 51 |
positions ([b, n, pos_dim]): Positions tensor.
|
| 52 |
"""
|
| 53 |
+
px = self.P(x.float())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
# apply RoPE-Mixed
|
| 56 |
+
freqs = positions @ self.freqs
|
| 57 |
+
freqs_cis = rearrange(
|
| 58 |
+
torch.polar(torch.ones_like(freqs), freqs), "b n c -> b 1 n c"
|
| 59 |
+
)
|
| 60 |
px_ = torch.view_as_complex(rearrange(px, "... (d two) -> ... d two", two=2))
|
| 61 |
out = rearrange(torch.view_as_real(px_ * freqs_cis), "... d two -> ... (d two)")
|
| 62 |
|
|
|
|
| 211 |
self.q_tile = q_tile
|
| 212 |
self.kv_tile = kv_tile
|
| 213 |
|
| 214 |
+
self.pe = CayleySTRING(dim // num_heads)
|
| 215 |
self.q = nn.Linear(dim, dim, bias=False)
|
| 216 |
self.kv = nn.Linear(src_dim, dim * 2, bias=False)
|
| 217 |
self.wo = nn.Linear(dim, dim, bias=False)
|
|
|
|
| 368 |
def init_weights(self) -> None:
|
| 369 |
prior_prob = 0.01
|
| 370 |
bias_value = -math.log((1 - prior_prob) / prior_prob)
|
| 371 |
+
self.class_head.bias.data = torch.ones(self.num_classes) * bias_value
|
| 372 |
|
| 373 |
# initialize regression layers
|
| 374 |
for head in self.point_head:
|
|
|
|
| 453 |
"absolute_points": relative_to_absolute_pos(
|
| 454 |
ref_points, self.query_block_size, self.query_block_size
|
| 455 |
).flatten(1, 2),
|
|
|
|
| 456 |
"aux_outputs": [
|
| 457 |
{
|
| 458 |
"logits": a,
|