Rihong
/

VideoChat2_HD_Infinity_Mistral_7B

Video-Text-to-Text

feature-extraction

Model card · Files and versions

Rihong committed on Mar 3

Commit

c4f85ed

·

verified ·

1 Parent(s): 56a2e58

Upload folder using huggingface_hub

Files changed (2) hide show

README.md +1 -1
vit.py +12 -5

README.md CHANGED Viewed

@@ -63,4 +63,4 @@ hf upload Rihong/VideoChat2_Infinity_Mistral_7B_hf ./lmms_eval/baselines/infty_v
 ## References
 - [VideoChat2 (Ask-Anything)](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2)
-- [Infinite-Video](https://github.com/deep-spin/Infinite-Video)

 ## References
 - [VideoChat2 (Ask-Anything)](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2)
+- [Infinite-Video](https://github.com/deep-spin/Infinite-Video)

vit.py CHANGED Viewed

@@ -88,13 +88,20 @@ class Attention(nn.Module):
         qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
         x = self.proj(x)
         x = self.proj_drop(x)
         return x

         qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)
+        # memory inefficient attention implementation that has been replaced by F.scaled_dot_product_attention
+        # q = q * self.scale
+        # attn = (q @ k.transpose(-2, -1))
+        # attn = attn.softmax(dim=-1)
+        # attn = self.attn_drop(attn)
+        # x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        # Use F.scaled_dot_product_attention for memory-efficient attention (flash attention)
+        dropout_p = self.attn_drop.p if self.training else 0.0
+        x = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p, scale=self.scale)
+        x = x.transpose(1, 2).reshape(B, N, -1)
         x = self.proj(x)
         x = self.proj_drop(x)
         return x