alexnasa
/

sam2_C_cpu

Model card Files Files and versions

xet

Community

alexnasa committed on Nov 10, 2025

Commit

8267b13

verified ·

1 Parent(s): bcc5314

Update sam2/modeling/sam/transformer.py

Browse files

Files changed (1) hide show

sam2/modeling/sam/transformer.py +33 -39

sam2/modeling/sam/transformer.py CHANGED Viewed

@@ -288,50 +288,44 @@ class RoPEAttention(Attention):
         self.freqs_cis = freqs_cis
         self.rope_k_repeat = rope_k_repeat
-    def forward(
-        self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0
-    ) -> Tensor:
-        # Input projections
         q = self.q_proj(q)
         k = self.k_proj(k)
         v = self.v_proj(v)
-        # # Separate into heads
-        # q = self._separate_heads(q, self.num_heads)
-        # k = self._separate_heads(k, self.num_heads)
-        # v = self._separate_heads(v, self.num_heads)
-        q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
-        k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
-        v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
-        # Apply rotary position encoding
-        w = h = math.sqrt(q.shape[-2])
-        self.freqs_cis = self.freqs_cis.to(q.device)
-        if self.freqs_cis.shape[0] != q.shape[-2]:
-            self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
-        if q.shape[-2] != k.shape[-2]:
             assert self.rope_k_repeat
-        num_k_rope = k.size(-2) - num_k_exclude_rope
-        q, k[:, :, :num_k_rope] = apply_rotary_enc(
-            q,
-            k[:, :, :num_k_rope],
             freqs_cis=self.freqs_cis,
             repeat_freqs_k=self.rope_k_repeat,
         )
-        dropout_p = self.dropout_p if self.training else 0.0
-        # #with torch.nn.attention.sdpa_kernel(get_sdp_backends(dropout_p)):
-        # out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
-        # out = self._recombine_heads(out)
-        out = flash_attn_interface.flash_attn_func(q, k, v)  # -> [b, s_q, n, d]
-        out = rearrange(out, "b s n d -> b s (n d)", n=self.num_heads)
-        out = self.out_proj(out)
-        return out

         self.freqs_cis = freqs_cis
         self.rope_k_repeat = rope_k_repeat
+    def forward(self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0) -> Tensor:
         q = self.q_proj(q)
         k = self.k_proj(k)
         v = self.v_proj(v)
+        # 1) reshape to (B, H, S, D) so RoPE sees the sequence at dim -2
+        q_hsd = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
+        k_hsd = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
+        v_hsd = rearrange(v, "b s (h d) -> b h s d", h=self.num_heads)
+        # 2) RoPE expects S at -2
+        S = q_hsd.shape[-2]
+        w = h = math.sqrt(S)
+        self.freqs_cis = self.freqs_cis.to(q_hsd.device)
+        if self.freqs_cis.shape[0] != S:
+            self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q_hsd.device)
+        if q_hsd.shape[-2] != k_hsd.shape[-2]:
             assert self.rope_k_repeat
+        num_k_rope = k_hsd.size(-2) - num_k_exclude_rope
+        q_hsd, k_hsd[:, :, :num_k_rope] = apply_rotary_enc(
+            q_hsd,
+            k_hsd[:, :, :num_k_rope],
             freqs_cis=self.freqs_cis,
             repeat_freqs_k=self.rope_k_repeat,
         )
+        # 3) switch to (B, S, H, D) for FlashAttention
+        q_bshd = rearrange(q_hsd, "b h s d -> b s h d")
+        k_bshd = rearrange(k_hsd, "b h s d -> b s h d")
+        v_bshd = rearrange(v_hsd, "b h s d -> b s h d")
+        out = flash_attn_interface.flash_attn_func(
+            q_bshd, k_bshd, v_bshd,
+            dropout_p=self.dropout_p if self.training else 0.0
+        )  # (B, S, H, D)
+        out = rearrange(out, "b s h d -> b s (h d)")
+        return self.out_proj(out)