Upload folder using huggingface_hub
Browse files
- Qformer.py +2 -1
- README.md +58 -0
- blip2.py +1 -1
- videochat2_it_hd_mistral.py +1 -1
Qformer.py
CHANGED
|
@@ -31,6 +31,7 @@ from transformers.modeling_utils import (
|
|
| 31 |
# find_pruneable_heads_and_indices,
|
| 32 |
# prune_linear_layer,
|
| 33 |
)
|
|
|
|
| 34 |
from transformers.pytorch_utils import (
|
| 35 |
# PreTrainedModel,
|
| 36 |
apply_chunking_to_forward,
|
|
@@ -1021,7 +1022,7 @@ class BertModel(BertPreTrainedModel):
|
|
| 1021 |
)
|
| 1022 |
|
| 1023 |
|
| 1024 |
-
class BertLMHeadModel(BertPreTrainedModel):
|
| 1025 |
|
| 1026 |
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
| 1027 |
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
|
|
|
| 31 |
# find_pruneable_heads_and_indices,
|
| 32 |
# prune_linear_layer,
|
| 33 |
)
|
| 34 |
+
from transformers import GenerationMixin
|
| 35 |
from transformers.pytorch_utils import (
|
| 36 |
# PreTrainedModel,
|
| 37 |
apply_chunking_to_forward,
|
|
|
|
| 1022 |
)
|
| 1023 |
|
| 1024 |
|
| 1025 |
+
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
|
| 1026 |
|
| 1027 |
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
| 1028 |
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Infinite-VideoChat2 (infty_videochat2)
|
| 2 |
+
|
| 3 |
+
HuggingFace-compatible model files for **VideoChat2-Infinity** (Mistral-7B), a long video understanding model that extends [VideoChat2-HD](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2) with infinite-length video processing via continuous long-term attention.
|
| 4 |
+
|
| 5 |
+
## Model Architecture
|
| 6 |
+
|
| 7 |
+
- **Vision Encoder** – UMT-L (ViT, 1024-dim, 24 layers)
|
| 8 |
+
- **Q-Former** – bridges visual features to the LLM with 32 + 64 query tokens
|
| 9 |
+
- **LLM** – [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
|
| 10 |
+
- **Long-Term Attention** – Gibbs-sampling-based continuous attention over basis functions for unbounded temporal context
|
| 11 |
+
|
| 12 |
+
## File Overview
|
| 13 |
+
|
| 14 |
+
| File | Description |
|
| 15 |
+
|------|-------------|
|
| 16 |
+
| `config.json` | HuggingFace `AutoConfig` configuration |
|
| 17 |
+
| `configuration_videochat2.py` | Custom `PretrainedConfig` subclass (`Config`) |
|
| 18 |
+
| `videochat2_it_hd_mistral.py` | Main model class (`VideoChat2_it_hd_mistral`) |
|
| 19 |
+
| `blip2.py` | BLIP-2 base class for vision-language bridging |
|
| 20 |
+
| `vit.py` | Vision Transformer (UMT-L) implementation |
|
| 21 |
+
| `Qformer.py` | Q-Former module |
|
| 22 |
+
| `basis_functions.py` | Basis functions (Power, Sine, Cosine, Gaussian, Rectangular) for long-term attention |
|
| 23 |
+
| `long_term_attention_gibbs.py` | Long-term attention with Gibbs sampling |
|
| 24 |
+
| `model-*.safetensors` | Model weights (sharded, 4 parts) |
|
| 25 |
+
| `.gitignore` | Ignores model weight files |
|
| 26 |
+
|
| 27 |
+
## Usage
|
| 28 |
+
|
| 29 |
+
The model is registered with HuggingFace `auto_map`, so it can be loaded directly:
|
| 30 |
+
|
| 31 |
+
```python
|
| 32 |
+
from transformers import AutoConfig, AutoModel
|
| 33 |
+
|
| 34 |
+
config = AutoConfig.from_pretrained("Rihong/VideoChat2_Infinity_Mistral_7B_hf", trust_remote_code=True)
|
| 35 |
+
model = AutoModel.from_pretrained("Rihong/VideoChat2_Infinity_Mistral_7B_hf", trust_remote_code=True)
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
To upload the model to the HuggingFace Hub, run:
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
hf upload Rihong/VideoChat2_Infinity_Mistral_7B_hf ./lmms_eval/baselines/infty_videochat2/
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Key Hyperparameters
|
| 45 |
+
|
| 46 |
+
| Parameter | Default | Description |
|
| 47 |
+
|-----------|---------|-------------|
|
| 48 |
+
| `num_basis` | 256 | Number of basis functions for long-term attention |
|
| 49 |
+
| `tau` | 0.75 | Temperature for Gibbs sampling |
|
| 50 |
+
| `alpha` | 0.75 | Mixing coefficient |
|
| 51 |
+
| `sticky` | `true` | Enable sticky memories |
|
| 52 |
+
| `hd_num` | 6 | Number of high-definition crops |
|
| 53 |
+
| `local_size` | 224 | Local crop resolution |
|
| 54 |
+
|
| 55 |
+
## References
|
| 56 |
+
|
| 57 |
+
- [VideoChat2 (Ask-Anything)](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2)
|
| 58 |
+
- [Infinite-Video](https://github.com/deep-spin/Infinite-Video)
|
blip2.py
CHANGED
|
@@ -43,7 +43,7 @@ class Blip2Base(PreTrainedModel):
|
|
| 43 |
enable_autocast = self.device != torch.device("cpu")
|
| 44 |
|
| 45 |
if enable_autocast:
|
| 46 |
-
return torch.cuda.amp.autocast(dtype=dtype)
|
| 47 |
else:
|
| 48 |
return contextlib.nullcontext()
|
| 49 |
|
|
|
|
| 43 |
enable_autocast = self.device != torch.device("cpu")
|
| 44 |
|
| 45 |
if enable_autocast:
|
| 46 |
+
return torch.amp.autocast('cuda', dtype=dtype)
|
| 47 |
else:
|
| 48 |
return contextlib.nullcontext()
|
| 49 |
|
videochat2_it_hd_mistral.py
CHANGED
|
@@ -95,7 +95,7 @@ class VideoChat2_it_hd_mistral(Blip2Base):
|
|
| 95 |
layer.output = None
|
| 96 |
layer.intermediate = None
|
| 97 |
else:
|
| 98 |
-
self.qformer.resize_token_embeddings(len(self.tokenizer))
|
| 99 |
self.qformer.cls = None
|
| 100 |
|
| 101 |
if vit_blip_model_path:
|
|
|
|
| 95 |
layer.output = None
|
| 96 |
layer.intermediate = None
|
| 97 |
else:
|
| 98 |
+
self.qformer.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
|
| 99 |
self.qformer.cls = None
|
| 100 |
|
| 101 |
if vit_blip_model_path:
|