Upload InternVideo2_cls

Browse files

Files changed (10) hide show

config.json +2 -3
model-00001-of-00007.safetensors +3 -0
model-00002-of-00007.safetensors +3 -0
model-00003-of-00007.safetensors +3 -0
model-00004-of-00007.safetensors +3 -0
model-00005-of-00007.safetensors +3 -0
model-00006-of-00007.safetensors +3 -0
model-00007-of-00007.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_videochat2_cls.py +110 -0

config.json CHANGED Viewed

@@ -1,12 +1,11 @@
 {
-  "_attn_implementation_autoset": true,
   "_name_or_path": "OpenGVLab/InternVideo2-Chat-8B",
   "architectures": [
-    "InternVideo2_VideoChat2"
   ],
   "auto_map": {
     "AutoConfig": "model_config.VideoChat2Config",
-    "AutoModel": "OpenGVLab/InternVideo2-Chat-8B--modeling_videochat2.InternVideo2_VideoChat2"
   },
   "model_cls": "InternVideo2_VideoChat2",
   "model_config": null,

 {
   "_name_or_path": "OpenGVLab/InternVideo2-Chat-8B",
   "architectures": [
+    "InternVideo2_cls"
   ],
   "auto_map": {
     "AutoConfig": "model_config.VideoChat2Config",
+    "AutoModel": "modeling_videochat2_cls.InternVideo2_cls"
   },
   "model_cls": "InternVideo2_VideoChat2",
   "model_config": null,

model-00001-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd09aaa0f9bf939da238f5a59884e2724a476d8fbf73e79c09a3c89e5391fac7
+size 4897891808

model-00002-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3178ad9789022615e0d8b3038d7c2a48fd7c2dc690dcbe5cef21811ec9af67c5
+size 4962470256

model-00003-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9f3cd59785982299fb0a8d05a3924c19250d8f036a41e83367cd5b9aeee1802
+size 4928226584

model-00004-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96507511a2b07c3729e23c5f79ab81ca57bec18760594dc456400cb0947929ed
+size 4794042872

model-00005-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a731dcdca4fc2474027a29dd05470f052d3f68c603317b7b484024d9c50982cd
+size 4860593096

model-00006-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc88627c93989ed60f3d38246dbd19431543664b2c5ec6792db35e7b5fc6fb42
+size 4794042872

model-00007-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdf0c1532cd2fb0d005b2b4829ada1ce05ee074d2e0905f8563a3b4146d70c3d
+size 4109221232

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_videochat2_cls.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import os
+from modeling_videochat2 import *
+from modeling_base import freeze_module
+from transformers import AutoConfig
+token = os.environ['HF_TOKEN']
+class InternVideo2_cls(InternVideo2_VideoChat2):
+    def __init__(self, config):
+        super(InternVideo2_VideoChat2, self).__init__(config=config)
+    def build_llm(self):
+        self.lm_name = self.model_config.llm.name
+        if self.model_config.llm.name == 'mistral_7b':
+            from transformers import AutoModelForSequenceClassification
+            config = AutoConfig.from_pretrained(
+                self.model_config.llm.pretrained_llm_path,
+                torch_dtype=torch.bfloat16,
+                token=token,
+                # attn_implementation="flash_attention_2",
+            )
+            self.lm = AutoModelForSequenceClassification.from_config(config)
+        elif self.model_config.llm.name == 'internlm_20b':
+            from transformers import AutoModelForSequenceClassification
+            self.lm = AutoModelForSequenceClassification.from_pretrained(
+                self.model_config.llm.pretrained_llm_path,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+            )
+            self.lm.gradient_checkpointing = True
+            self.lm._set_gradient_checkpointing()
+        elif self.model_config.llm.name == 'internlm2_5_7b':
+            from transformers import AutoModelForSequenceClassification
+            self.lm = AutoModelForSequenceClassification.from_pretrained(
+                self.model_config.llm.pretrained_llm_path,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+                local_files_only=True,
+            )
+        else:
+            raise NotImplementedError(self.model_config.llm.name)
+        self.freeze_llm = self.model_config.get("freeze_llm", True)
+        logger.info(f'freeze_llm: {self.freeze_llm}')
+        if self.freeze_llm:
+            logger.info("freeze llm")
+            freeze_module(self.lm)
+        if self.model_config.llm.use_lora:
+            self.use_lora = True
+            from peft import get_peft_model, LoraConfig, TaskType
+            logger.info("Use lora")
+            if self.model_config.llm.name == 'internlm_20b':
+                peft_config = LoraConfig(
+                    task_type=TaskType.CAUSAL_LM, inference_mode=False,
+                    r=self.model_config.llm.lora_r, lora_alpha=self.model_config.llm.lora_alpha, lora_dropout=self.model_config.llm.lora_dropout,
+                    target_modules=['wqkv', 'wo', 'w1', 'w2', 'w3', 'output']
+                )
+            else:
+                peft_config = LoraConfig(
+                    task_type=TaskType.CAUSAL_LM, inference_mode=False,
+                    r=self.model_config.llm.lora_r, lora_alpha=self.model_config.llm.lora_alpha, lora_dropout=self.model_config.llm.lora_dropout,
+                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                                    "gate_proj", "up_proj", "down_proj", "lm_head"]
+                )
+            self.lm = get_peft_model(self.lm, peft_config)
+            self.lm.enable_input_require_grads()
+            self.lm.print_trainable_parameters()
+        else:
+            self.use_lora = False
+    def build_conversation(self,instruction, user_prompt,media_type='video',msg=''):
+        conversation = ""
+        if instruction:
+            conversation += instruction
+        conversation += ("[INST]" + " ")
+        if media_type == 'image':
+            conversation +=( "<Image>" + IMG_TOKEN + "</Image>")#*ilen
+        else:
+            conversation += ("<Video>" + VID_TOKEN + "</Video>")#*ilen
+        conversation += (msg.rstrip() + "[/INST]")
+        conversation += (" [INST] " + user_prompt + " [/INST]")
+        conversation += ("")
+        return conversation
+if __name__ == "__main__":
+    tokenizer =  AutoTokenizer.from_pretrained('OpenGVLab/InternVideo2-Chat-8B',trust_remote_code=True,use_fast=False)
+    config = AutoConfig.from_pretrained('OpenGVLab/InternVideo2-Chat-8B', torch_dtype=torch.bfloat16,trust_remote_code=True)
+    model = InternVideo2_Classification(config).cuda()
+    B, T, C, H, W = 1, 8, 3, 224, 224
+    video_tensor = torch.randn(B,T,C,H,W).cuda()
+    user_prompt = "this is a user prompt"
+    instruction = "this is an instruction"
+    conversation = model.build_conversation(instruction=instruction, user_prompt=user_prompt, media_type='video')
+    tokenized = model.build_input_ids(tokenizer,conversation,max_length=248,add_special_tokens=True,truncation=False,padding=False,return_tensors='pt')
+    input_ids = tokenized['input_ids'].unsqueeze(0).to(model.device)
+    attn_mask = tokenized['attention_mask'].unsqueeze(0).to(model.device)
+    indexes = tokenized['index'].unsqueeze(0)
+    text_embeds = model.pad_text_embeds(input_ids = input_ids,video = video_tensor,video_idx = indexes)
+    outputs = model.lm(inputs_embeds=text_embeds, attention_mask=attn_mask,output_hidden_states=True,return_dict=True)