morpheushoc committed on
Commit
1b5cb00
·
verified ·
1 Parent(s): 95f05fb

Upload InternVideo2_Classification

Browse files
config.json CHANGED
@@ -6,9 +6,51 @@
6
  "AutoConfig": "model_config.VideoChat2Config",
7
  "AutoModel": "modeling_videochat2_classification.InternVideo2_Classification"
8
  },
9
- "model_config": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  "model_type": "InternVideo2_VideoChat2",
11
- "torch_dtype": "bfloat16",
12
  "transformers_version": "4.46.1",
13
  "use_cache": true
14
  }
 
6
  "AutoConfig": "model_config.VideoChat2Config",
7
  "AutoModel": "modeling_videochat2_classification.InternVideo2_Classification"
8
  },
9
+ "model_config": {
10
+ "bridge": {
11
+ "extra_num_query_token": 64,
12
+ "name": "qformer",
13
+ "num_query_token": 32,
14
+ "qformer_attention_probs_dropout_prob": 0.1,
15
+ "qformer_drop_path_rate": 0.2,
16
+ "qformer_hidden_dropout_prob": 0.1
17
+ },
18
+ "freeze_bridge": false,
19
+ "freeze_llm": false,
20
+ "freeze_vision_encoder": false,
21
+ "llm": {
22
+ "lora_alpha": 32,
23
+ "lora_dropout": 0.1,
24
+ "lora_r": 16,
25
+ "name": "mistral_7b",
26
+ "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
27
+ "use_lora": true
28
+ },
29
+ "loss": {
30
+ "use_vision_regression_loss": false
31
+ },
32
+ "pretrained_paths": {},
33
+ "use_flash_attention": true,
34
+ "vision_encoder": {
35
+ "checkpoint_num": 48,
36
+ "d_model": 1408,
37
+ "encoder_embed_dim": 1408,
38
+ "img_size": 224,
39
+ "name": "internvideo2-1B",
40
+ "num_frames": 8,
41
+ "origin_num_frames": 4,
42
+ "patch_size": 14,
43
+ "pretrained": null,
44
+ "sep_image_video_pos_embed": true,
45
+ "tubelet_size": 1,
46
+ "use_checkpoint": true,
47
+ "vit_add_ln": true,
48
+ "x_vis_only": true,
49
+ "x_vis_return_idx": -2
50
+ }
51
+ },
52
  "model_type": "InternVideo2_VideoChat2",
53
+ "torch_dtype": "float32",
54
  "transformers_version": "4.46.1",
55
  "use_cache": true
56
  }
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd09aaa0f9bf939da238f5a59884e2724a476d8fbf73e79c09a3c89e5391fac7
3
+ size 4897891808
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3178ad9789022615e0d8b3038d7c2a48fd7c2dc690dcbe5cef21811ec9af67c5
3
+ size 4962470256
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f3cd59785982299fb0a8d05a3924c19250d8f036a41e83367cd5b9aeee1802
3
+ size 4928226584
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96507511a2b07c3729e23c5f79ab81ca57bec18760594dc456400cb0947929ed
3
+ size 4794042872
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a731dcdca4fc2474027a29dd05470f052d3f68c603317b7b484024d9c50982cd
3
+ size 4860593096
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc88627c93989ed60f3d38246dbd19431543664b2c5ec6792db35e7b5fc6fb42
3
+ size 4794042872
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97a19eaa9479d1c4fdef003bc9f843f4f93ce9d934a2d6f839ae514f514cf030
3
+ size 4109221232
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
modeling_videochat2_classification.py CHANGED
@@ -10,7 +10,7 @@ from modeling_qformer import build_qformer
10
  # from .flash_attention_class import FlashAttention
11
  from model_config import VideoChat2Config
12
 
13
- from transformers import AutoTokenizer,AutoModel, AutoConfig, PreTrainedModel
14
  import logging
15
  logger = logging.getLogger(__name__)
16
 
@@ -47,10 +47,10 @@ def freeze_module(module):
47
 
48
 
49
  class InternVideo2_Classification(PreTrainedModel):
50
- config_class = VideoChat2Config
51
  def __init__(self, config):
52
  self.model_config = config.model_config
53
- config.model_config = None
54
  super().__init__(config)
55
  self.build_vision_encoder()
56
  self.build_llm()
 
10
  # from .flash_attention_class import FlashAttention
11
  from model_config import VideoChat2Config
12
 
13
+ from transformers import AutoTokenizer,AutoModel, AutoConfig, PreTrainedModel, PretrainedConfig
14
  import logging
15
  logger = logging.getLogger(__name__)
16
 
 
47
 
48
 
49
  class InternVideo2_Classification(PreTrainedModel):
50
+ config_class = PretrainedConfig
51
  def __init__(self, config):
52
  self.model_config = config.model_config
53
+ # config.model_config = None
54
  super().__init__(config)
55
  self.build_vision_encoder()
56
  self.build_llm()