first commit

Files changed (13) hide show

config.json +19 -0
configuration_videoccam.py +38 -0
llm_adapter/README.md +202 -0
llm_adapter/adapter_config.json +31 -0
llm_adapter/adapter_model.safetensors +3 -0
modeling_videoccam.py +215 -0
projector/config.json +26 -0
projector/configuration_ccam.py +51 -0
projector/model.safetensors +3 -0
projector/modeling_ccam.py +196 -0
vision_encoder_adapter/README.md +202 -0
vision_encoder_adapter/adapter_config.json +36 -0
vision_encoder_adapter/adapter_model.safetensors +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_name_or_path": "",
+  "architectures": [
+    "VideoCCAM"
+  ],
+  "llm_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
+  "vision_encoder_name_or_path": "google/siglip-so400m-patch14-384",
+  "auto_map": {
+    "AutoConfig": "configuration_videoccam.VideoCCAMConfig",
+    "AutoModel": "modeling_videoccam.VideoCCAM"
+  },
+  "image_token": "<image>",
+  "video_token": "<video>",
+  "vision_select_layer": -2,
+  "vision_max_chunk_size": 0,
+  "_attn_implementation": "flash_attention_2",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0"
+}

configuration_videoccam.py ADDED Viewed

	@@ -0,0 +1,38 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+================================================
+@author: Jaron
+@time: 2024/08/21 17:51:45
+@email: fjjth98@163.com
+@description:
+================================================
+"""
+from transformers import PretrainedConfig
+class VideoCCAMConfig(PretrainedConfig):
+    model_type = 'videoccam'
+    _auto_class = 'AutoConfig'
+    def __init__(
+        self,
+        llm_name_or_path: str = None,
+        projector_name_or_path: str = None,
+        vision_encoder_name_or_path: str = None,
+        image_token: str = '<image>',
+        video_token: str = '<video>',
+        vision_select_layer: int = -2,
+        vision_max_chunk_size: int = 0,
+        _attn_implementation: str = 'flash_attention_2',
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.llm_name_or_path = llm_name_or_path
+        self.projector_name_or_path = projector_name_or_path
+        self.vision_encoder_name_or_path = vision_encoder_name_or_path
+        self.image_token = image_token
+        self.video_token = video_token
+        self.vision_select_layer = vision_select_layer
+        self.vision_max_chunk_size = vision_max_chunk_size
+        self._attn_implementation = _attn_implementation

llm_adapter/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: microsoft/Phi-3-mini-4k-instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

llm_adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 512,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "qkv_proj",
+    "o_proj",
+    "down_proj",
+    "gate_up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

llm_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ee23117aeba1157e3ad4b17301d45babb6a365bd467708a0875c0cb58ecedbe
+size 1610648408

modeling_videoccam.py ADDED Viewed

	@@ -0,0 +1,215 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+================================================
+@author: Jaron
+@time: 2024/08/21 17:41:52
+@email: fjjth98@163.com
+@description: Video-CCAM
+================================================
+"""
+import torch
+import os.path as osp
+from PIL import Image
+from peft import PeftModel
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, SiglipVisionModel, SiglipImageProcessor, GenerationConfig
+from .configuration_videoccam import VideoCCAMConfig
+class VideoCCAM(PreTrainedModel):
+    config_class = VideoCCAMConfig
+    _auto_class = 'AutoModel'
+    supports_gradient_checkpointing = True
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def __init__(self, config, device_map: str = 'auto'):
+        super().__init__(config)
+        self.image_token = config.image_token
+        self.video_token = config.video_token
+        self.vision_select_layer = config.vision_select_layer
+        self.vision_max_chunk_size = config.vision_max_chunk_size
+        self.gradient_checkpointing = False
+        self.projector = AutoModel.from_pretrained(
+            config.projector_name_or_path,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=config.torch_dtype,
+            attn_implementation='sdpa' if config._attn_implementation == 'flash_attention_2' else config._attn_implementation       # CCAM does not support flash_attention_2
+        )
+        self.llm = AutoModelForCausalLM.from_pretrained(
+            config.llm_name_or_path,
+            device_map=device_map,
+            torch_dtype=config.torch_dtype,
+            attn_implementation=config._attn_implementation
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            config.llm_name_or_path,
+            additional_special_tokens=[self.image_token, self.video_token]
+        )
+        self.generation_config = GenerationConfig.from_pretrained(config.llm_name_or_path)
+        self.image_token_id, self.video_token_id = self.tokenizer.convert_tokens_to_ids([self.image_token, self.video_token])
+        self.vision_encoder = SiglipVisionModel.from_pretrained(
+            config.vision_encoder_name_or_path,
+            device_map=device_map,
+            torch_dtype=config.torch_dtype,
+            attn_implementation=config._attn_implementation
+        )
+        self.image_processor = SiglipImageProcessor.from_pretrained(
+            config.vision_encoder_name_or_path
+        )
+    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+        if gradient_checkpointing_kwargs is None:
+            gradient_checkpointing_kwargs = dict(use_reentrant=False)
+        self.llm.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
+        self.vision_encoder.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
+    def forward_visual_embeds(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        if self.vision_select_layer in {-1, self.vision_encoder.config.num_hidden_layers}:
+            visual_embeds = self.vision_encoder(pixel_values, output_hidden_states=False).last_hidden_state
+        else:
+            visual_embeds = self.vision_encoder(pixel_values, output_hidden_states=True).hidden_states[self.vision_select_layer]
+        return visual_embeds
+    @torch.inference_mode
+    def chat(
+        self,
+        messages: list[list[dict]],
+        images: list[Image.Image, list[Image.Image]] = None,
+        generation_config = None,
+        batch_generate: bool = False,
+        visual_embeds: torch.Tensor = None,
+        return_visual_embeds: bool = False,
+        **kwargs
+    ):
+        if generation_config is None:
+            generation_config = self.generation_config
+        # compute visual embeds
+        if visual_embeds is None:
+            _images, split_size = [], []
+            for i in images:
+                if isinstance(i, Image.Image):
+                    _images.append(i)
+                    split_size.append(1)
+                else:
+                    _images += i
+                    split_size.append(len(i))
+            pixel_values = self.image_processor(
+                _images,
+                return_tensors='pt'
+            )['pixel_values'].to(
+                dtype=self.vision_encoder.get_input_embeddings().weight.dtype,
+                device=self.vision_encoder.get_input_embeddings().weight.device
+            )
+            if 0 < self.vision_max_chunk_size < len(pixel_values):
+                split_idx = list(range(0, len(pixel_values), self.vision_max_chunk_size)) + [-1]
+                visual_embeds = torch.cat([
+                    self.forward_visual_embeds(pixel_values[le:ri])
+                    for le, ri in zip(split_idx[:-1], split_idx[1:])
+                ], dim=0)
+            else:
+                visual_embeds = self.forward_visual_embeds(pixel_values)
+            visual_embeds = self.projector(visual_embeds.split(split_size, dim=0))
+        # compute textual embeds
+        device = self.llm.get_input_embeddings().weight.device
+        input_ids = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)     # list[list[int]]
+        _input_ids, split_idx = [], [0]
+        for i in input_ids:
+            _input_ids += i
+            split_idx.append(split_idx[-1] + len(i))
+        _input_ids = torch.tensor(_input_ids, dtype=torch.long, device=device)
+        visual_idx = torch.where((_input_ids == self.image_token_id) | (_input_ids == self.video_token_id))[0].tolist()
+        assert len(visual_idx) == len(visual_embeds), f'The number of visual tokens ({len(visual_idx)}) should be equal to the number of visual features ({len(visual_embeds)}).'
+        _input_ids[visual_idx] = 0      # avoid index overflow
+        _inputs_embeds = self.llm.get_input_embeddings()(_input_ids)
+        inputs_embeds, cur_visual_pointer = [], 0
+        for start_idx, end_idx in zip(split_idx[:-1], split_idx[1:]):
+            if cur_visual_pointer < len(visual_idx) and visual_idx[cur_visual_pointer] < end_idx:
+                mid_idx = visual_idx[cur_visual_pointer]
+                embeds = [_inputs_embeds[start_idx:mid_idx], visual_embeds[cur_visual_pointer]]
+                cur_visual_pointer += 1
+                while cur_visual_pointer < len(visual_idx) and visual_idx[cur_visual_pointer] < end_idx:
+                    embeds += [_inputs_embeds[mid_idx+1:visual_idx[cur_visual_pointer]], visual_embeds[cur_visual_pointer]]
+                    mid_idx = visual_idx[cur_visual_pointer]
+                    cur_visual_pointer += 1
+                embeds.append(_inputs_embeds[mid_idx+1:end_idx])
+                inputs_embeds.append(torch.cat(embeds, dim=0))
+            # Pure Text
+            else:
+                inputs_embeds.append(_inputs_embeds[start_idx:end_idx])
+        if batch_generate:
+            B, L = len(inputs_embeds), max(i.size(0) for i in inputs_embeds)
+            pad_embeds = self.llm.get_input_embeddings()(
+                torch.tensor([self.tokenizer.pad_token_id], dtype=torch.long, device=device)
+            )       # (1, C)
+            inputs_embeds_list = []
+            attention_mask = torch.zeros(B, L, dtype=torch.long, device=device)
+            for i, embeds in enumerate(inputs_embeds):
+                l = embeds.size(0)
+                inputs_embeds_list += [pad_embeds.expand(L - l, -1), embeds]
+                attention_mask[i, -l:] = 1
+            inputs_embeds = torch.cat(inputs_embeds_list, dim=0).view(B, L, -1)
+            output_ids = self.llm.generate(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                generation_config=generation_config,
+                **kwargs
+            )
+        else:
+            output_ids = []
+            for embeds in inputs_embeds:
+                output_ids.append(self.llm.generate(
+                    inputs_embeds=embeds[None],
+                    attention_mask=torch.ones(1, embeds.size(0), dtype=torch.long, device=device),
+                    generation_config=generation_config,
+                    **kwargs
+                )[0])
+        prediction = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        if return_visual_embeds:
+            return prediction, visual_embeds
+        else:
+            return prediction
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        *args,
+        config: VideoCCAMConfig = None,
+        torch_dtype: torch.dtype = torch.bfloat16,
+        device_map: str = 'auto',
+        **kwargs
+    ) -> PreTrainedModel:
+        merge_pretrained_lora = kwargs.pop('merge_pretrained_lora', True)
+        config.torch_dtype = torch_dtype
+        config.projector_name_or_path = osp.join(pretrained_model_name_or_path, 'projector')
+        if osp.isdir(cur_path := osp.join(pretrained_model_name_or_path, 'llm')):
+            config.llm_name_or_path = cur_path
+        if osp.isdir(cur_path := osp.join(pretrained_model_name_or_path, 'vision_encoder')):
+            config.vision_encoder_name_or_path = cur_path
+        model = cls(config, device_map)
+        # load LoRA if exists
+        if osp.exists(cur_path := osp.join(pretrained_model_name_or_path, 'llm_adapter')):
+            model.llm = PeftModel.from_pretrained(model.llm, cur_path, device_map=device_map)
+            print(f'Load LLM adapter from {cur_path}.')
+            if merge_pretrained_lora:
+                model.llm = model.llm.merge_and_unload()
+        if osp.exists(cur_path := osp.join(pretrained_model_name_or_path, 'vision_encoder_adapter')):
+            model.vision_encoder = PeftModel.from_pretrained(model.vision_encoder, cur_path, device_map=device_map)
+            print(f'Load vision encoder adapter from {cur_path}.')
+            if merge_pretrained_lora:
+                model.vision_encoder = model.vision_encoder.merge_and_unload()
+        return model

projector/config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "",
+  "architectures": [
+    "CCAMModel"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_ccam.CCAMConfig",
+    "AutoModel": "modeling_ccam.CCAMModel"
+  },
+  "cross_hidden_size": 1152,
+  "dropout": 0.1,
+  "hidden_act": "swiglu",
+  "hidden_size": 1024,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "mlp_bias": true,
+  "model_type": "ccam",
+  "num_heads": 16,
+  "num_key_value_heads": 16,
+  "num_query": 1024,
+  "output_size": 3072,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0"
+}

projector/configuration_ccam.py ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+================================================
+@author: Jaron
+@time: 2024/07/10 19:43:31
+@email: fjjth98@163.com
+@description: Causal Cross-Attention Mask (CCAM)
+================================================
+"""
+from transformers import PretrainedConfig
+class CCAMConfig(PretrainedConfig):
+    model_type = 'ccam'
+    _auto_class = 'AutoConfig'
+    def __init__(
+        self,
+        num_query: int = 1024,
+        num_heads: int = 16,
+        hidden_size: int = 1024,
+        intermediate_size: int = 4096,
+        num_key_value_heads: int = 16,
+        dropout: float = 0.1,
+        mlp_bias: bool = True,
+        hidden_act: str = 'swiglu',
+        output_size: int = None,            # inferred from llm
+        attention_bias: bool = True,
+        layer_norm_eps: float = 1e-5,
+        cross_hidden_size: int = None,      # inferred from vision encoder
+        attention_dropout: float = 0.1,
+        _attn_implementation: str = 'sdpa',
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.dropout = dropout
+        self.mlp_bias = mlp_bias
+        self.num_query = num_query
+        self.num_heads = num_heads
+        self.hidden_act = hidden_act
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_bias = attention_bias
+        self.intermediate_size = intermediate_size
+        self.cross_hidden_size = cross_hidden_size
+        self.attention_dropout = attention_dropout
+        self.num_key_value_heads = num_key_value_heads
+        self._attn_implementation = _attn_implementation

projector/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8da47515343b380025d3a3aa8835f28551eb14bf9a6b172b48b21bf2428108a4
+size 52993528

projector/modeling_ccam.py ADDED Viewed

	@@ -0,0 +1,196 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+================================================
+@author: Jaron
+@time: 2024/07/10 19:47:01
+@email: fjjth98@163.com
+@description: Causal Cross-Attention Mask (CCAM)
+================================================
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from .configuration_ccam import CCAMConfig
+class CCAMMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_act = config.hidden_act
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.output_size = config.output_size
+        if self.hidden_act == 'swiglu':
+            self.fc1 = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.mlp_bias)
+            self.act_fn = ACT2FN['silu']
+        else:
+            self.fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+            self.act_fn = ACT2FN[self.hidden_act]
+        self.fc2 = nn.Linear(self.intermediate_size, self.output_size, bias=config.mlp_bias)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        if self.hidden_act == 'swiglu':
+            gate, up = hidden_states.chunk(2, dim=-1)
+            hidden_states = self.act_fn(gate) * up
+        else:
+            hidden_states = self.act_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+class CCAMCrossAttention(nn.Module):
+    """Cross-attention layer of the CCAM projector.
+    Flash Attention 2 is not supported since the mask may be neither full nor causal. Only support `attn_implementation` as `eager` and `sdpa`.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.num_heads = config.num_heads
+        self.hidden_size = config.hidden_size
+        self.attention_bias = config.attention_bias
+        self.attention_dropout = config.attention_dropout
+        self.cross_hidden_size = config.cross_hidden_size
+        self.num_key_value_heads = config.num_key_value_heads
+        self.attn_implementation = config._attn_implementation
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        assert self.head_dim * self.num_heads == self.hidden_size, f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads}).'
+        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.attention_bias)
+        self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=self.attention_bias)
+        self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=self.attention_bias)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.attention_bias)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,                # (B, Q, C)
+        cross_hidden_states: torch.Tensor,          # (B, L, C')
+        attention_mask: torch.Tensor = None         # (Q, L), '-inf' means masked, 0 means not masked
+    ) -> torch.Tensor:      # (B, Q, C)
+        B, Q, C = hidden_states.size()
+        query_states = self.q_proj(hidden_states)   # (B, Q, C)
+        key_states = self.k_proj(cross_hidden_states)
+        value_states = self.v_proj(cross_hidden_states)
+        L = key_states.size(1)
+        query_states = query_states.view(B, Q, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(B, L, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(B, L, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if self.num_key_value_groups > 1:
+            key_states = key_states.repeat_interleave(repeats=self.num_key_value_groups, dim=1)
+            value_states = value_states.repeat_interleave(repeats=self.num_key_value_groups, dim=1)
+        if self.attn_implementation == 'eager':
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / self.head_dim ** 0.5    # (B, num_heads, Q, L)
+            if attention_mask is not None:
+                attn_weights = attn_weights + attention_mask.view(1, 1, Q, L)
+            # upcast attention to fp32
+            attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+            attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+            attn_output = torch.matmul(attn_weights, value_states)      # (B, num_heads, Q, head_dim)
+        else:           # 'sdpa'
+            # there are bugs in torch <=2.1.0, requiring qkv as contiguous(), be careful
+            attn_output = F.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                dropout_p=self.attention_dropout if self.training else 0.0
+            )
+        attn_output = attn_output.transpose(1, 2).reshape(B, Q, C)          # (B, Q, C)
+        attn_output = self.o_proj(attn_output)
+        return attn_output
+class CCAMModel(PreTrainedModel):
+    """Causal Cross-Attention Mask Projector"""
+    config_class = CCAMConfig
+    _auto_class = 'AutoModel'
+    _supports_sdpa = True
+    _no_split_modules = ['CCAMCrossAttention', 'CCAMMLP']
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_query = config.num_query
+        self.hidden_size = config.hidden_size
+        self.output_size = config.output_size
+        self.cross_hidden_size = config.cross_hidden_size
+        self.query = nn.Parameter(torch.empty(1, self.num_query, self.hidden_size).normal_(mean=.0, std=.02))
+        self.pre_ccam = nn.Sequential(
+            nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps),
+            nn.Dropout(config.dropout)
+        )
+        self.ccam = CCAMCrossAttention(config)
+        self.post_ccam = nn.Sequential(
+            nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps),
+            nn.Dropout(config.dropout),
+            CCAMMLP(config)
+        )
+        self.post_init()
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=.0, std=.02)
+            if hasattr(module, "bias") and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+    def _get_mask(self, vision_hidden_state: torch.Tensor) -> torch.Tensor:      # (Q, T*L)
+        """Compute CCAM Mask for vision hidden state
+        Args:
+            vision_hidden_state (torch.Tensor): (T, L, C)
+        Returns:
+            torch.Tensor: (Q, T*L) -inf means masked
+        """
+        T, L, _ = vision_hidden_state.size()
+        dtype, device = vision_hidden_state.dtype, vision_hidden_state.device
+        base_mask = torch.zeros(T, T, dtype=dtype, device=device)
+        t = torch.arange(T, device=device)
+        base_mask.masked_fill_(t > t[:, None], float('-inf'))
+        attention_mask = torch.zeros(self.num_query, T * L, dtype=dtype, device=device)
+        attention_mask[:self.num_query // T * T] = torch.kron(base_mask, torch.ones(self.num_query // T, L, dtype=dtype, device=device))
+        return attention_mask
+    def forward(self, vision_hidden_states: list[torch.Tensor]) -> torch.Tensor:      # (B, Q, C)
+        """Forward function, do not collect batch due to the support of zero3
+        Args:
+            vision_hidden_states (list[torch.Tensor]): [(t0, L, C), (t1, L, C), ...]
+        Returns:
+            torch.Tensor: (B, Q, C)
+        """
+        output = []
+        for hidden_states in vision_hidden_states:
+            # reshape inputs and construct ccam masks
+            attention_mask = self._get_mask(hidden_states)    # (Q, ti * L)
+            # forward
+            x = self.pre_ccam(self.query)       # (1, Q, C)
+            x = self.ccam(
+                hidden_states=x,                # (1, Q, C)
+                cross_hidden_states=hidden_states.flatten(0, 1)[None],      # (1, ti * L, C')
+                attention_mask=attention_mask[None]     # (1, Q, ti * L)
+            ) + x
+            x = self.post_ccam(x)
+            output.append(x)
+        output = torch.cat(output, dim=0)
+        return output

vision_encoder_adapter/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: google/siglip-so400m-patch14-384
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.12.0

vision_encoder_adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "SiglipVisionModel",
+    "parent_library": "transformers.models.siglip.modeling_siglip"
+  },
+  "base_model_name_or_path": "google/siglip-so400m-patch14-384",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "fc2",
+    "k_proj",
+    "q_proj",
+    "fc1",
+    "out_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

vision_encoder_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34b34dc74f3b34f15a23853d0ebec5d6d2afcad081a34b64d54baf0d012eb4a5
+size 71302696