modeling_videochat_flash.py CHANGED
@@ -636,7 +636,10 @@ class VideoChatFlashQwenForCausalLM(LlavaMetaForCausalLM, Qwen2ForCausalLM_Flash
636
 
637
  image_sizes = [frames[0].shape[:2]]
638
 
639
- frames = [self.get_vision_tower().image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(self.model.dtype).cuda()]
 
 
 
640
 
641
  conv = conv_templates["qwen_2"].copy()
642
 
@@ -652,14 +655,20 @@ class VideoChatFlashQwenForCausalLM(LlavaMetaForCausalLM, Qwen2ForCausalLM_Flash
652
 
653
  prompt = conv.get_prompt()
654
 
655
- input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
 
 
 
656
 
657
  if tokenizer.pad_token_id is None:
658
  if "qwen" in tokenizer.name_or_path.lower():
659
  print("Setting pad token to bos token for qwen model.")
660
  tokenizer.pad_token_id = 151643
661
 
662
- attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
 
 
 
663
 
664
  stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
665
  keywords = [stop_str]
 
636
 
637
  image_sizes = [frames[0].shape[:2]]
638
 
639
+ if torch.cuda.is_available():
640
+ frames = [self.get_vision_tower().image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(self.model.dtype).cuda()]
641
+ else:
642
+ frames = [self.get_vision_tower().image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(self.model.dtype)]
643
 
644
  conv = conv_templates["qwen_2"].copy()
645
 
 
655
 
656
  prompt = conv.get_prompt()
657
 
658
+ if torch.cuda.is_available():
659
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
660
+ else:
661
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0)
662
 
663
  if tokenizer.pad_token_id is None:
664
  if "qwen" in tokenizer.name_or_path.lower():
665
  print("Setting pad token to bos token for qwen model.")
666
  tokenizer.pad_token_id = 151643
667
 
668
+ if torch.cuda.is_available():
669
+ attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
670
+ else:
671
+ attention_masks = input_ids.ne(tokenizer.pad_token_id).long()
672
 
673
  stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
674
  keywords = [stop_str]
vision_tower_builder.py CHANGED
@@ -24,9 +24,11 @@ from transformers.image_utils import (
24
  to_numpy_array,
25
  )
26
 
27
- from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
28
- from flash_attn.bert_padding import unpad_input, pad_input
29
-
 
 
30
 
31
  class FlashAttention(nn.Module):
32
  """Implement the scaled dot product attention with softmax.
@@ -729,7 +731,7 @@ class InternVideo2VisionConfig:
729
  patch_size=14,
730
  x_vis_return_idx=-2,
731
  sep_image_video_pos_embed=True,
732
- use_checkpoint=True,
733
  checkpoint_num=40,
734
  # **kwargs,
735
  ):
@@ -757,7 +759,7 @@ def build_vit(config, pt_type='origin'):
757
  drop_path_rate=0.25,
758
  init_values=0.00001,
759
  qk_normalization=True,
760
- use_flash_attn=True,
761
  use_fused_rmsnorm=False,
762
  use_fused_mlp=False,
763
  fused_mlp_heuristic=1,
 
24
  to_numpy_array,
25
  )
26
 
27
try:
    # flash-attn is an optional, CUDA-only acceleration dependency; when it is
    # absent these names stay undefined and callers must take the non-flash
    # attention path (e.g. use_flash_attn=False in build_vit).
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.bert_padding import unpad_input, pad_input
except ImportError:
    # Only a missing/uninstallable package is tolerated. A bare `except: pass`
    # here would also hide real errors (broken CUDA build, typos) — keep it narrow.
    pass
32
 
33
  class FlashAttention(nn.Module):
34
  """Implement the scaled dot product attention with softmax.
 
731
  patch_size=14,
732
  x_vis_return_idx=-2,
733
  sep_image_video_pos_embed=True,
734
+ use_checkpoint=False,
735
  checkpoint_num=40,
736
  # **kwargs,
737
  ):
 
759
  drop_path_rate=0.25,
760
  init_values=0.00001,
761
  qk_normalization=True,
762
+ use_flash_attn=torch.cuda.is_available(),
763
  use_fused_rmsnorm=False,
764
  use_fused_mlp=False,
765
  fused_mlp_heuristic=1,