Rihong committed on
Commit
6e80a24
·
verified ·
1 Parent(s): 5cf25a4

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Qformer.py +2 -1
  2. README.md +58 -0
  3. blip2.py +1 -1
  4. videochat2_it_hd_mistral.py +1 -1
Qformer.py CHANGED
@@ -31,6 +31,7 @@ from transformers.modeling_utils import (
31
  # find_pruneable_heads_and_indices,
32
  # prune_linear_layer,
33
  )
 
34
  from transformers.pytorch_utils import (
35
  # PreTrainedModel,
36
  apply_chunking_to_forward,
@@ -1021,7 +1022,7 @@ class BertModel(BertPreTrainedModel):
1021
  )
1022
 
1023
 
1024
- class BertLMHeadModel(BertPreTrainedModel):
1025
 
1026
  _keys_to_ignore_on_load_unexpected = [r"pooler"]
1027
  _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
 
31
  # find_pruneable_heads_and_indices,
32
  # prune_linear_layer,
33
  )
34
+ from transformers import GenerationMixin
35
  from transformers.pytorch_utils import (
36
  # PreTrainedModel,
37
  apply_chunking_to_forward,
 
1022
  )
1023
 
1024
 
1025
+ class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
1026
 
1027
  _keys_to_ignore_on_load_unexpected = [r"pooler"]
1028
  _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Infinite-VideoChat2 (infty_videochat2)
2
+
3
+ HuggingFace-compatible model files for **VideoChat2-Infinity** (Mistral-7B), a long video understanding model that extends [VideoChat2-HD](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2) with infinite-length video processing via continuous long-term attention.
4
+
5
+ ## Model Architecture
6
+
7
+ - **Vision Encoder** – UMT-L (ViT, 1024-dim, 24 layers)
8
+ - **Q-Former** – bridges visual features to the LLM with 32 + 64 query tokens
9
+ - **LLM** – [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
10
+ - **Long-Term Attention** – Gibbs-sampling-based continuous attention over basis functions for unbounded temporal context
11
+
12
+ ## File Overview
13
+
14
+ | File | Description |
15
+ |------|-------------|
16
+ | `config.json` | HuggingFace `AutoConfig` configuration |
17
+ | `configuration_videochat2.py` | Custom `PretrainedConfig` subclass (`Config`) |
18
+ | `videochat2_it_hd_mistral.py` | Main model class (`VideoChat2_it_hd_mistral`) |
19
+ | `blip2.py` | BLIP-2 base class for vision-language bridging |
20
+ | `vit.py` | Vision Transformer (UMT-L) implementation |
21
+ | `Qformer.py` | Q-Former module |
22
+ | `basis_functions.py` | Basis functions (Power, Sine, Cosine, Gaussian, Rectangular) for long-term attention |
23
+ | `long_term_attention_gibbs.py` | Long-term attention with Gibbs sampling |
24
+ | `model-*.safetensors` | Model weights (sharded, 4 parts) |
25
+ | `.gitignore` | Ignores model weight files |
26
+
27
+ ## Usage
28
+
29
+ The model is registered with HuggingFace `auto_map`, so it can be loaded directly:
30
+
31
+ ```python
32
+ from transformers import AutoConfig, AutoModel
33
+
34
+ config = AutoConfig.from_pretrained("Rihong/VideoChat2_Infinity_Mistral_7B_hf", trust_remote_code=True)
35
+ model = AutoModel.from_pretrained("Rihong/VideoChat2_Infinity_Mistral_7B_hf", trust_remote_code=True)
36
+ ```
37
+
38
+ To upload the model to HuggingFace, just run:
39
+
40
+ ```bash
41
+ hf upload Rihong/VideoChat2_Infinity_Mistral_7B_hf ./lmms_eval/baselines/infty_videochat2/
42
+ ```
43
+
44
+ ## Key Hyperparameters
45
+
46
+ | Parameter | Default | Description |
47
+ |-----------|---------|-------------|
48
+ | `num_basis` | 256 | Number of basis functions for long-term attention |
49
+ | `tau` | 0.75 | Temperature for Gibbs sampling |
50
+ | `alpha` | 0.75 | Mixing coefficient |
51
+ | `sticky` | `true` | Enable sticky memories |
52
+ | `hd_num` | 6 | Number of high-definition crops |
53
+ | `local_size` | 224 | Local crop resolution |
54
+
55
+ ## References
56
+
57
+ - [VideoChat2 (Ask-Anything)](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2)
58
+ - [Infinite-Video](https://github.com/deep-spin/Infinite-Video)
blip2.py CHANGED
@@ -43,7 +43,7 @@ class Blip2Base(PreTrainedModel):
43
  enable_autocast = self.device != torch.device("cpu")
44
 
45
  if enable_autocast:
46
- return torch.cuda.amp.autocast(dtype=dtype)
47
  else:
48
  return contextlib.nullcontext()
49
 
 
43
  enable_autocast = self.device != torch.device("cpu")
44
 
45
  if enable_autocast:
46
+ return torch.amp.autocast('cuda', dtype=dtype)
47
  else:
48
  return contextlib.nullcontext()
49
 
videochat2_it_hd_mistral.py CHANGED
@@ -95,7 +95,7 @@ class VideoChat2_it_hd_mistral(Blip2Base):
95
  layer.output = None
96
  layer.intermediate = None
97
  else:
98
- self.qformer.resize_token_embeddings(len(self.tokenizer))
99
  self.qformer.cls = None
100
 
101
  if vit_blip_model_path:
 
95
  layer.output = None
96
  layer.intermediate = None
97
  else:
98
+ self.qformer.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
99
  self.qformer.cls = None
100
 
101
  if vit_blip_model_path: