Upload InternVideo2_Classification

Files changed:
- config.json (+44 -2)
- model-00001-of-00007.safetensors (+3 -0)
- model-00002-of-00007.safetensors (+3 -0)
- model-00003-of-00007.safetensors (+3 -0)
- model-00004-of-00007.safetensors (+3 -0)
- model-00005-of-00007.safetensors (+3 -0)
- model-00006-of-00007.safetensors (+3 -0)
- model-00007-of-00007.safetensors (+3 -0)
- model.safetensors.index.json (+0 -0)
- modeling_videochat2_classification.py (+3 -3)
config.json CHANGED

@@ -6,9 +6,51 @@
     "AutoConfig": "model_config.VideoChat2Config",
     "AutoModel": "modeling_videochat2_classification.InternVideo2_Classification"
   },
-  "model_config": …,
+  "model_config": {
+    "bridge": {
+      "extra_num_query_token": 64,
+      "name": "qformer",
+      "num_query_token": 32,
+      "qformer_attention_probs_dropout_prob": 0.1,
+      "qformer_drop_path_rate": 0.2,
+      "qformer_hidden_dropout_prob": 0.1
+    },
+    "freeze_bridge": false,
+    "freeze_llm": false,
+    "freeze_vision_encoder": false,
+    "llm": {
+      "lora_alpha": 32,
+      "lora_dropout": 0.1,
+      "lora_r": 16,
+      "name": "mistral_7b",
+      "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
+      "use_lora": true
+    },
+    "loss": {
+      "use_vision_regression_loss": false
+    },
+    "pretrained_paths": {},
+    "use_flash_attention": true,
+    "vision_encoder": {
+      "checkpoint_num": 48,
+      "d_model": 1408,
+      "encoder_embed_dim": 1408,
+      "img_size": 224,
+      "name": "internvideo2-1B",
+      "num_frames": 8,
+      "origin_num_frames": 4,
+      "patch_size": 14,
+      "pretrained": null,
+      "sep_image_video_pos_embed": true,
+      "tubelet_size": 1,
+      "use_checkpoint": true,
+      "vit_add_ln": true,
+      "x_vis_only": true,
+      "x_vis_return_idx": -2
+    }
+  },
   "model_type": "InternVideo2_VideoChat2",
-  "torch_dtype": "…",
+  "torch_dtype": "float32",
   "transformers_version": "4.46.1",
   "use_cache": true
 }
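The `AutoConfig`/`AutoModel` entries above are the repo's `auto_map`, so the checkpoint is meant to be loaded through the Auto classes with remote code enabled. A minimal loading sketch — the repo id below is a placeholder, not the actual repository path:

    from transformers import AutoModel

    # trust_remote_code=True is required: the model class named above
    # (InternVideo2_Classification) ships inside the repository itself.
    model = AutoModel.from_pretrained(
        "your-namespace/InternVideo2_Classification",  # hypothetical repo id
        trust_remote_code=True,
    )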
model-00001-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd09aaa0f9bf939da238f5a59884e2724a476d8fbf73e79c09a3c89e5391fac7
+size 4897891808
model-00002-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3178ad9789022615e0d8b3038d7c2a48fd7c2dc690dcbe5cef21811ec9af67c5
+size 4962470256
model-00003-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9f3cd59785982299fb0a8d05a3924c19250d8f036a41e83367cd5b9aeee1802
+size 4928226584
model-00004-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96507511a2b07c3729e23c5f79ab81ca57bec18760594dc456400cb0947929ed
+size 4794042872
model-00005-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a731dcdca4fc2474027a29dd05470f052d3f68c603317b7b484024d9c50982cd
+size 4860593096
model-00006-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc88627c93989ed60f3d38246dbd19431543664b2c5ec6792db35e7b5fc6fb42
+size 4794042872
model-00007-of-00007.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97a19eaa9479d1c4fdef003bc9f843f4f93ce9d934a2d6f839ae514f514cf030
+size 4109221232
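Each of the seven entries above is a Git LFS pointer: the repository records only the sha256 oid and the byte size, while the shard itself lives in LFS storage. The sizes sum to 33,346,488,720 bytes (about 33.3 GB), which at the `"torch_dtype": "float32"` from config.json works out to roughly 8.3 B parameters — plausible for Mistral-7B plus the 1B-parameter vision encoder and the QFormer bridge. A small sketch for checking a downloaded shard against its pointer (file name and digest taken from the first pointer above):

    import hashlib

    def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
        """Stream the file through sha256 so a ~5 GB shard never loads into RAM."""
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk_size):
                digest.update(block)
        return digest.hexdigest()

    # oid from the model-00001 pointer above
    expected = "bd09aaa0f9bf939da238f5a59884e2724a476d8fbf73e79c09a3c89e5391fac7"
    assert sha256_of("model-00001-of-00007.safetensors") == expected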
model.safetensors.index.json CHANGED

(The diff for this file is too large to render; see the raw diff.)
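Although the index diff is not rendered, the file follows the standard sharded-safetensors layout: a `metadata.total_size` field plus a `weight_map` from tensor name to shard file. A quick way to inspect it locally:

    import json

    with open("model.safetensors.index.json") as f:
        index = json.load(f)

    print(index["metadata"]["total_size"])         # total bytes across all shards
    for name, shard in list(index["weight_map"].items())[:5]:
        print(name, "->", shard)                   # tensor name -> shard file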
modeling_videochat2_classification.py CHANGED

@@ -10,7 +10,7 @@ from modeling_qformer import build_qformer
 # from .flash_attention_class import FlashAttention
 from model_config import VideoChat2Config
 
-from transformers import AutoTokenizer,AutoModel, AutoConfig, PreTrainedModel
+from transformers import AutoTokenizer,AutoModel, AutoConfig, PreTrainedModel, PretrainedConfig
 import logging
 logger = logging.getLogger(__name__)
 
@@ -47,10 +47,10 @@ def freeze_module(module):
 
 
 class InternVideo2_Classification(PreTrainedModel):
-    config_class = …
+    config_class = PretrainedConfig
     def __init__(self, config):
         self.model_config = config.model_config
-        config.model_config = None
+        # config.model_config = None
         super().__init__(config)
         self.build_vision_encoder()
         self.build_llm()
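Two things change here: the transformers import gains `PretrainedConfig`, which then backs `config_class`, and the line that nulled out `config.model_config` before `super().__init__(config)` is commented out. `config_class` is what `from_pretrained` uses to materialize config.json, so pointing it at the base `PretrainedConfig` keeps that step generic; and because `model_config` is no longer stripped from the config, the nested settings are serialized back out on save — which is exactly the 44-line `model_config` block added to config.json above. A minimal sketch of the config side of this pattern, assuming `model_config.py` defines something along these lines (it is not shown in this diff):

    from transformers import PretrainedConfig

    class VideoChat2Config(PretrainedConfig):
        model_type = "InternVideo2_VideoChat2"

        def __init__(self, model_config=None, **kwargs):
            # The nested dict (bridge / llm / vision_encoder ...) round-trips
            # through config.json because it stays on the config object.
            self.model_config = model_config
            super().__init__(**kwargs)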