alexnasa
/

sam2_C_cpu

Model card Files Files and versions

xet

Community

alex committed on Nov 10, 2025

Commit

a8172af

1 Parent(s): c4d567b

use flash

Browse files

Files changed (1) hide show

sam2/modeling/sam/transformer.py +60 -44

sam2/modeling/sam/transformer.py CHANGED Viewed

@@ -23,6 +23,13 @@ from einops import rearrange
 warnings.simplefilter(action="ignore", category=FutureWarning)
 # OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings()
 class TwoWayTransformer(nn.Module):
     def __init__(
@@ -241,7 +248,22 @@ class Attention(nn.Module):
         k = self.k_proj(k)
         v = self.v_proj(v)
-        if q.device == "cpu":
             # Separate into heads
             q = self._separate_heads(q, self.num_heads)
             k = self._separate_heads(k, self.num_heads)
@@ -255,17 +277,6 @@ class Attention(nn.Module):
             out = self._recombine_heads(out)
-        else:
-            q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
-            k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
-            v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
-            out = flash_attn_interface.flash_attn_func(q, k, v)  # -> [b, s_q, n, d]
-            out = rearrange(out, "b s n d -> b s (n d)", n=self.num_heads)
-            out = self.out_proj(out)
         return out
@@ -299,40 +310,10 @@ class RoPEAttention(Attention):
         k = self.k_proj(k)
         v = self.v_proj(v)
-        if q.device == "cpu":
-            # Separate into heads
-            q = self._separate_heads(q, self.num_heads)
-            k = self._separate_heads(k, self.num_heads)
-            v = self._separate_heads(v, self.num_heads)
-            # Apply rotary position encoding
-            w = h = math.sqrt(q.shape[-2])
-            self.freqs_cis = self.freqs_cis.to(q.device)
-            if self.freqs_cis.shape[0] != q.shape[-2]:
-                self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
-            if q.shape[-2] != k.shape[-2]:
-                assert self.rope_k_repeat
-            num_k_rope = k.size(-2) - num_k_exclude_rope
-            q, k[:, :, :num_k_rope] = apply_rotary_enc(
-                q,
-                k[:, :, :num_k_rope],
-                freqs_cis=self.freqs_cis,
-                repeat_freqs_k=self.rope_k_repeat,
-            )
-            dropout_p = self.dropout_p if self.training else 0.0
-            #with torch.nn.attention.sdpa_kernel(get_sdp_backends(dropout_p)):
-            out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
-            out = self._recombine_heads(out)
-            out = self.out_proj(out)
-            return out
-        else:
             # 1) reshape to (B, H, S, D) so RoPE sees the sequence at dim -2
             q_hsd = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
             k_hsd = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
@@ -365,4 +346,39 @@ class RoPEAttention(Attention):
             )  # (B, S, H, D)
             out = rearrange(out, "b s h d -> b s (h d)")
             return self.out_proj(out)

 warnings.simplefilter(action="ignore", category=FutureWarning)
 # OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings()
+def _can_use_flash_attn(q: torch.Tensor) -> bool:
+    # Decide whether the FlashAttention code path may be taken for tensor `q`.
+    # Returns True only when all three checks below pass: `q` lives on a CUDA
+    # device, its dtype is float16 or bfloat16, and the device's compute
+    # capability major version is at least 8.
+    # FlashAttention works on CUDA with fp16/bf16 and (usually) Ampere+ GPUs
+    # NOTE(review): only device/dtype/capability are checked here; whether the
+    # flash-attention package itself is importable is not verified -- confirm.
+    if not q.is_cuda:
+        # Non-CUDA tensors (e.g. CPU) short-circuit before any CUDA query is made.
+        return False
+    # get_device_capability returns (major, minor); only the major version is used.
+    major, _ = torch.cuda.get_device_capability(q.device)
+    return q.dtype in (torch.float16, torch.bfloat16) and major >= 8  # A100/RTX30+ typically
 class TwoWayTransformer(nn.Module):
     def __init__(
         k = self.k_proj(k)
         v = self.v_proj(v)
+        use_flash = _can_use_flash_attn(q)
+        if use_flash:
+            q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
+            k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
+            v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
+            out = flash_attn_interface.flash_attn_func(q, k, v)  # -> [b, s_q, n, d]
+            out = rearrange(out, "b s n d -> b s (n d)", n=self.num_heads)
+            out = self.out_proj(out)
+        else:
             # Separate into heads
             q = self._separate_heads(q, self.num_heads)
             k = self._separate_heads(k, self.num_heads)
             out = self._recombine_heads(out)
         return out
         k = self.k_proj(k)
         v = self.v_proj(v)
+        use_flash = _can_use_flash_attn(q)
+        if use_flash:
             # 1) reshape to (B, H, S, D) so RoPE sees the sequence at dim -2
             q_hsd = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
             k_hsd = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
             )  # (B, S, H, D)
             out = rearrange(out, "b s h d -> b s (h d)")
             return self.out_proj(out)
+        else:
+            # Separate into heads
+            q = self._separate_heads(q, self.num_heads)
+            k = self._separate_heads(k, self.num_heads)
+            v = self._separate_heads(v, self.num_heads)
+            # Apply rotary position encoding
+            w = h = math.sqrt(q.shape[-2])
+            self.freqs_cis = self.freqs_cis.to(q.device)
+            if self.freqs_cis.shape[0] != q.shape[-2]:
+                self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
+            if q.shape[-2] != k.shape[-2]:
+                assert self.rope_k_repeat
+            num_k_rope = k.size(-2) - num_k_exclude_rope
+            q, k[:, :, :num_k_rope] = apply_rotary_enc(
+                q,
+                k[:, :, :num_k_rope],
+                freqs_cis=self.freqs_cis,
+                repeat_freqs_k=self.rope_k_repeat,
+            )
+            dropout_p = self.dropout_p if self.training else 0.0
+            #with torch.nn.attention.sdpa_kernel(get_sdp_backends(dropout_p)):
+            out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+            out = self._recombine_heads(out)
+            out = self.out_proj(out)
+            return out