Add support for SDPA attention
modeling_siglip2.py  CHANGED  (+30, -0)
@@ -505,7 +505,37 @@ class Vision_EagerAttention(nn.Module):
         return attn_output, None
 
 
+class Vision_SDPAAttention(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        dim, heads = config.hidden_size, config.num_attention_heads
+        self.num_heads, self.head_dim = heads, dim // heads
+        self.k_proj, self.v_proj, self.q_proj, self.out_proj = [nn.Linear(dim, dim) for _ in range(4)]
+        self.dropout = getattr(config, "attention_dropout", 0.0)
+
+    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb=None, position_embeddings=None):
+        seq_length = hidden_states.shape[0]
+        q, k, v = self.q_proj(hidden_states).view(seq_length, self.num_heads, self.head_dim), self.k_proj(hidden_states).view(seq_length, self.num_heads, self.head_dim), self.v_proj(hidden_states).view(seq_length, self.num_heads, self.head_dim)
+        if position_embeddings is None:
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        else:
+            cos, sin = position_embeddings
+        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+        attention_mask = torch.full([1, 1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype)
+        for i in range(1, len(cu_seqlens)):
+            attention_mask[..., cu_seqlens[i-1]:cu_seqlens[i], cu_seqlens[i-1]:cu_seqlens[i]] = 0
+
+        q = q.transpose(0, 1).unsqueeze(0)
+        k = k.transpose(0, 1).unsqueeze(0)
+        v = v.transpose(0, 1).unsqueeze(0)
+        attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attention_mask)
+        return self.out_proj(attn_output.squeeze(0).transpose(0, 1).reshape(seq_length, -1).to(hidden_states.dtype)), None
+
+
 VISION_ATTENTION_CLASSES = {
+    'sdpa': Vision_SDPAAttention,
     'eager': Vision_EagerAttention,
     'flash_attention_2': Vision_FlashAttention2,
 }
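
The core of the new class is the additive block-diagonal mask built from cu_seqlens, which keeps attention confined to each packed sequence. A minimal, self-contained sketch of the same construction (the helper name and the example boundaries are mine, not from the patch):

import torch

def block_diagonal_mask(cu_seqlens, dtype=torch.float32):
    # cu_seqlens holds cumulative boundaries, e.g. [0, 3, 7] for two packed
    # sequences of lengths 3 and 4. Everything starts at the most negative
    # finite value; positions inside each block are reset to 0, so adding
    # the mask to the attention logits zeroes out cross-sequence attention.
    total = int(cu_seqlens[-1])
    mask = torch.full([1, 1, total, total], torch.finfo(dtype).min, dtype=dtype)
    for i in range(1, len(cu_seqlens)):
        mask[..., cu_seqlens[i - 1]:cu_seqlens[i], cu_seqlens[i - 1]:cu_seqlens[i]] = 0
    return mask

print((block_diagonal_mask(torch.tensor([0, 3, 7]))[0, 0] == 0).int())
# Prints a 7x7 matrix with a 3x3 and a 4x4 block of ones on the diagonal.

The dense mask costs O(L^2) memory in the total packed length, which is the main trade-off against the flash_attention_2 path, whose variable-length kernels can consume cu_seqlens directly. Note also that self.dropout is stored in __init__ but not forwarded as dropout_p to scaled_dot_product_attention in this hunk.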
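
With 'sdpa' registered, a vision block can select the implementation by key. The call site is not part of this hunk; assuming the usual transformers-style config._attn_implementation attribute, dispatch would look roughly like:

# Hypothetical call site -- the patch only shows the registry, not the
# selection logic; the attribute name is an assumption.
attn_class = VISION_ATTENTION_CLASSES[getattr(config, "_attn_implementation", "eager")]
self.attn = attn_class(config)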