Upload folder using huggingface_hub
Browse files
- Qformer.py +2 -1
- README.md +58 -0
- blip2.py +1 -1
- videochat2_it_hd_mistral.py +1 -1
Qformer.py
CHANGED
|
@@ -31,6 +31,7 @@ from transformers.modeling_utils import (
|
|
| 31 |
# find_pruneable_heads_and_indices,
|
| 32 |
# prune_linear_layer,
|
| 33 |
)
|
|
|
|
| 34 |
from transformers.pytorch_utils import (
|
| 35 |
# PreTrainedModel,
|
| 36 |
apply_chunking_to_forward,
|
|
@@ -1021,7 +1022,7 @@ class BertModel(BertPreTrainedModel):
|
|
| 1021 |
)
|
| 1022 |
|
| 1023 |
|
| 1024 |
-
class BertLMHeadModel(BertPreTrainedModel):
|
| 1025 |
|
| 1026 |
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
| 1027 |
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
|
|
|
| 31 |
# find_pruneable_heads_and_indices,
|
| 32 |
# prune_linear_layer,
|
| 33 |
)
|
| 34 |
+
from transformers import GenerationMixin
|
| 35 |
from transformers.pytorch_utils import (
|
| 36 |
# PreTrainedModel,
|
| 37 |
apply_chunking_to_forward,
|
|
|
|
| 1022 |
)
|
| 1023 |
|
| 1024 |
|
| 1025 |
+
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
|
| 1026 |
|
| 1027 |
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
| 1028 |
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Infinite-VideoChat2 (infty_videochat2)
|
| 2 |
+
|
| 3 |
+
HuggingFace-compatible model files for **VideoChat2-Infinity** (Mistral-7B), a long video understanding model that extends [VideoChat2-HD](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2) with infinite-length video processing via continuous long-term attention.
|
| 4 |
+
|
| 5 |
+
## Model Architecture
|
| 6 |
+
|
| 7 |
+
- **Vision Encoder** – UMT-L (ViT, 1024-dim, 24 layers)
|
| 8 |
+
- **Q-Former** – bridges visual features to the LLM with 32 + 64 query tokens
|
| 9 |
+
- **LLM** – [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
|
| 10 |
+
- **Long-Term Attention** – Gibbs-sampling-based continuous attention over basis functions for unbounded temporal context
|
| 11 |
+
|
| 12 |
+
## File Overview
|
| 13 |
+
|
| 14 |
+
| File | Description |
|
| 15 |
+
|------|-------------|
|
| 16 |
+
| `config.json` | HuggingFace `AutoConfig` configuration |
|
| 17 |
+
| `configuration_videochat2.py` | Custom `PretrainedConfig` subclass (`Config`) |
|
| 18 |
+
| `videochat2_it_hd_mistral.py` | Main model class (`VideoChat2_it_hd_mistral`) |
|
| 19 |
+
| `blip2.py` | BLIP-2 base class for vision-language bridging |
|
| 20 |
+
| `vit.py` | Vision Transformer (UMT-L) implementation |
|
| 21 |
+
| `Qformer.py` | Q-Former module |
|
| 22 |
+
| `basis_functions.py` | Basis functions (Power, Sine, Cosine, Gaussian, Rectangular) for long-term attention |
|
| 23 |
+
| `long_term_attention_gibbs.py` | Long-term attention with Gibbs sampling |
|
| 24 |
+
| `model-*.safetensors` | Model weights (sharded, 4 parts) |
|
| 25 |
+
| `.gitignore` | Ignores model weight files |
|
| 26 |
+
|
| 27 |
+
## Usage
|
| 28 |
+
|
| 29 |
+
The model is registered with HuggingFace `auto_map`, so it can be loaded directly:
|
| 30 |
+
|
| 31 |
+
```python
|
| 32 |
+
from transformers import AutoConfig, AutoModel
|
| 33 |
+
|
| 34 |
+
config = AutoConfig.from_pretrained("Rihong/VideoChat2_Infinity_Mistral_7B_hf", trust_remote_code=True)
|
| 35 |
+
model = AutoModel.from_pretrained("Rihong/VideoChat2_Infinity_Mistral_7B_hf", trust_remote_code=True)
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
To upload the model to the HuggingFace Hub, run:
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
hf upload Rihong/VideoChat2_Infinity_Mistral_7B_hf ./lmms_eval/baselines/infty_videochat2/
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Key Hyperparameters
|
| 45 |
+
|
| 46 |
+
| Parameter | Default | Description |
|
| 47 |
+
|-----------|---------|-------------|
|
| 48 |
+
| `num_basis` | 256 | Number of basis functions for long-term attention |
|
| 49 |
+
| `tau` | 0.75 | Temperature for Gibbs sampling |
|
| 50 |
+
| `alpha` | 0.75 | Mixing coefficient |
|
| 51 |
+
| `sticky` | `true` | Enable sticky memories |
|
| 52 |
+
| `hd_num` | 6 | Number of high-definition crops |
|
| 53 |
+
| `local_size` | 224 | Local crop resolution |
|
| 54 |
+
|
| 55 |
+
## References
|
| 56 |
+
|
| 57 |
+
- [VideoChat2 (Ask-Anything)](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2)
|
| 58 |
+
- [Infinite-Video](https://github.com/deep-spin/Infinite-Video)
|
blip2.py
CHANGED
|
@@ -43,7 +43,7 @@ class Blip2Base(PreTrainedModel):
|
|
| 43 |
enable_autocast = self.device != torch.device("cpu")
|
| 44 |
|
| 45 |
if enable_autocast:
|
| 46 |
-
return torch.cuda.amp.autocast(dtype=dtype)
|
| 47 |
else:
|
| 48 |
return contextlib.nullcontext()
|
| 49 |
|
|
|
|
| 43 |
enable_autocast = self.device != torch.device("cpu")
|
| 44 |
|
| 45 |
if enable_autocast:
|
| 46 |
+
return torch.amp.autocast('cuda', dtype=dtype)
|
| 47 |
else:
|
| 48 |
return contextlib.nullcontext()
|
| 49 |
|
videochat2_it_hd_mistral.py
CHANGED
|
@@ -95,7 +95,7 @@ class VideoChat2_it_hd_mistral(Blip2Base):
|
|
| 95 |
layer.output = None
|
| 96 |
layer.intermediate = None
|
| 97 |
else:
|
| 98 |
-
self.qformer.resize_token_embeddings(len(self.tokenizer))
|
| 99 |
self.qformer.cls = None
|
| 100 |
|
| 101 |
if vit_blip_model_path:
|
|
|
|
| 95 |
layer.output = None
|
| 96 |
layer.intermediate = None
|
| 97 |
else:
|
| 98 |
+
self.qformer.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
|
| 99 |
self.qformer.cls = None
|
| 100 |
|
| 101 |
if vit_blip_model_path:
|