Rihong committed on
Commit 4e5ef76 · verified · 1 Parent(s): 698c03f

Upload folder using huggingface_hub

Qformer.py ADDED
@@ -0,0 +1,1272 @@
1
+ """
2
+ * Copyright (c) 2023, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ * Based on huggingface code base
8
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
9
+ """
10
+
11
+ import math
12
+ from typing import Tuple
13
+
14
+ import torch
15
+ from torch import Tensor, device, nn
16
+ import torch.utils.checkpoint
17
+ from torch import nn
18
+ from torch.nn import CrossEntropyLoss
19
+
20
+ # from timm.layers import drop_path
21
+ from transformers.activations import ACT2FN
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPastAndCrossAttentions,
24
+ BaseModelOutputWithPoolingAndCrossAttentions,
25
+ CausalLMOutputWithCrossAttentions,
26
+ MaskedLMOutput,
27
+ )
28
+ from transformers.modeling_utils import (
29
+ PreTrainedModel,
30
+ # apply_chunking_to_forward,
31
+ # find_pruneable_heads_and_indices,
32
+ # prune_linear_layer,
33
+ )
34
+ from transformers.pytorch_utils import (
35
+ # PreTrainedModel,
36
+ apply_chunking_to_forward,
37
+ find_pruneable_heads_and_indices,
38
+ prune_linear_layer,
39
+ )
40
+ from transformers.utils import logging
41
+ from transformers.models.bert.configuration_bert import BertConfig
42
+
43
+ from functools import partial
44
+ from .ltm.long_term_attention_gibbs import LongTermAttention
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
+ class BertEmbeddings(nn.Module):
50
+ """Construct the embeddings from word and position embeddings."""
51
+
52
+ def __init__(self, config):
53
+ super().__init__()
54
+ self.word_embeddings = nn.Embedding(
55
+ config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
56
+ )
57
+ self.position_embeddings = nn.Embedding(
58
+ config.max_position_embeddings, config.hidden_size
59
+ )
60
+
61
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
62
+ # any TensorFlow checkpoint file
63
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
64
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
65
+
66
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
67
+ self.register_buffer(
68
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
69
+ )
70
+ self.position_embedding_type = getattr(
71
+ config, "position_embedding_type", "absolute"
72
+ )
73
+
74
+ self.config = config
75
+
76
+ def forward(
77
+ self,
78
+ input_ids=None,
79
+ position_ids=None,
80
+ query_embeds=None,
81
+ past_key_values_length=0,
82
+ ):
83
+ if input_ids is not None:
84
+ seq_length = input_ids.size()[1]
85
+ else:
86
+ seq_length = 0
87
+
88
+ if position_ids is None:
89
+ position_ids = self.position_ids[
90
+ :, past_key_values_length : seq_length + past_key_values_length
91
+ ].clone()
92
+
93
+ if input_ids is not None:
94
+ embeddings = self.word_embeddings(input_ids)
95
+ if self.position_embedding_type == "absolute":
96
+ position_embeddings = self.position_embeddings(position_ids)
97
+ embeddings = embeddings + position_embeddings
98
+
99
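+ # Learned query tokens are prepended to the text embeddings; with no input_ids the queries alone form the sequence.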
+ if query_embeds is not None:
100
+ embeddings = torch.cat((query_embeds, embeddings), dim=1)
101
+ else:
102
+ embeddings = query_embeds
103
+
104
+ embeddings = self.LayerNorm(embeddings)
105
+ embeddings = self.dropout(embeddings)
106
+ return embeddings
107
+
108
+
109
+ class BertSelfAttention(nn.Module):
110
+ def __init__(self, config, is_cross_attention):
111
+ super().__init__()
112
+ self.config = config
113
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
114
+ config, "embedding_size"
115
+ ):
116
+ raise ValueError(
117
+ "The hidden size (%d) is not a multiple of the number of attention "
118
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
119
+ )
120
+ self.num_attention_heads = config.num_attention_heads
121
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
122
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
123
+ self.is_cross_attention = is_cross_attention
124
+ self.alpha = config.alpha
125
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
126
+ if is_cross_attention:
127
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
128
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
129
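+ # Cross-attention layers additionally build a continuous long-term memory attention that reuses the key/value projections defined above.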
+ long_term_attn_mechanism = partial(LongTermAttention,
130
+ attn_num_basis=config.num_basis,
131
+ head_size=self.attention_head_size,
132
+ length=config.encoder_width,
133
+ target_len=config.encoder_width,
134
+ attn_func="softmax",
135
+ infinite_memory=True,
136
+ n_layers=2,
137
+ attn_drop=0.1,
138
+ n_heads=self.num_attention_heads,
139
+ d_model=self.all_head_size,
140
+ affines=True,
141
+ mask=True,
142
+ mask_type="cnn",
143
+ kl_regularizer=False,
144
+ sigma_0=None,
145
+ mu_0=None,
146
+ sticky_memories=config.sticky,
147
+ continuous=True,
148
+ sigmas=1,
149
+ tau=config.tau,
150
+ proj_key=self.key,
151
+ proj_value=self.value
152
+ )
153
+ self.long_term_attention = long_term_attn_mechanism()
154
+ if not is_cross_attention:
155
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
156
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
157
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
158
+ self.position_embedding_type = getattr(
159
+ config, "position_embedding_type", "absolute"
160
+ )
161
+ if (
162
+ self.position_embedding_type == "relative_key"
163
+ or self.position_embedding_type == "relative_key_query"
164
+ ):
165
+ self.max_position_embeddings = config.max_position_embeddings
166
+ self.distance_embedding = nn.Embedding(
167
+ 2 * config.max_position_embeddings - 1, self.attention_head_size
168
+ )
169
+ self.save_attention = False
170
+
171
+ def save_attn_gradients(self, attn_gradients):
172
+ self.attn_gradients = attn_gradients
173
+
174
+ def get_attn_gradients(self):
175
+ return self.attn_gradients
176
+
177
+ def save_attention_map(self, attention_map):
178
+ self.attention_map = attention_map
179
+
180
+ def get_attention_map(self):
181
+ return self.attention_map
182
+
183
+ def transpose_for_scores(self, x):
184
+ new_x_shape = x.size()[:-1] + (
185
+ self.num_attention_heads,
186
+ self.attention_head_size,
187
+ )
188
+ x = x.view(*new_x_shape)
189
+ return x.permute(0, 2, 1, 3)
190
+
191
+ def forward(
192
+ self,
193
+ hidden_states,
194
+ position_embedding_ext,
195
+ layer,
196
+ attention_mask=None,
197
+ head_mask=None,
198
+ encoder_hidden_states=None,
199
+ encoder_attention_mask=None,
200
+ past_key_value=None,
201
+ output_attentions=False,
202
+ new_video=False,
203
+ ):
204
+
205
+ mixed_query_layer = self.query(hidden_states) #[1, 32, 768]
206
+ # If this is instantiated as a cross-attention module, the keys
207
+ # and values come from an encoder; the attention mask needs to be
208
+ # such that the encoder's padding tokens are not attended to.
209
+ is_cross_attention = self.is_cross_attention
210
+ if is_cross_attention:
211
+ bsz, p, h = encoder_hidden_states.shape
212
+ self.long_term_attention.length = p
213
+ self.long_term_attention.target_len = p
214
+ if self.alpha != 1.0:
215
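+ # Query the long-term memory with the current queries; .detach() keeps gradients from flowing back through this path.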
+ a_long_term = self.long_term_attention(encoder_hidden_states, mixed_query_layer, new_doc=new_video, layer_n=layer).detach()
216
+ else:
217
+ a_long_term = 0
218
+ if is_cross_attention:
219
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
220
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
221
+ attention_mask = encoder_attention_mask
222
+ elif past_key_value is not None:
223
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
224
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
225
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
226
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
227
+ else:
228
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
229
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
230
+
231
+
232
+ query_layer = self.transpose_for_scores(mixed_query_layer)#[1,12,32,64]
233
+ past_key_value = (key_layer, value_layer)
234
+
235
+ # Take the dot product between "query" and "key" to get the raw attention scores.
236
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
237
+
238
+ if (
239
+ self.position_embedding_type == "relative_key"
240
+ or self.position_embedding_type == "relative_key_query"
241
+ ):
242
+ seq_length = hidden_states.size()[1]
243
+ position_ids_l = torch.arange(
244
+ seq_length, dtype=torch.long, device=hidden_states.device
245
+ ).view(-1, 1)
246
+ position_ids_r = torch.arange(
247
+ seq_length, dtype=torch.long, device=hidden_states.device
248
+ ).view(1, -1)
249
+ distance = position_ids_l - position_ids_r
250
+ positional_embedding = self.distance_embedding(
251
+ distance + self.max_position_embeddings - 1
252
+ )
253
+ positional_embedding = positional_embedding.to(
254
+ dtype=query_layer.dtype
255
+ ) # fp16 compatibility
256
+
257
+ if self.position_embedding_type == "relative_key":
258
+ relative_position_scores = torch.einsum(
259
+ "bhld,lrd->bhlr", query_layer, positional_embedding
260
+ )
261
+ attention_scores = attention_scores + relative_position_scores
262
+ elif self.position_embedding_type == "relative_key_query":
263
+ relative_position_scores_query = torch.einsum(
264
+ "bhld,lrd->bhlr", query_layer, positional_embedding
265
+ )
266
+ relative_position_scores_key = torch.einsum(
267
+ "bhrd,lrd->bhlr", key_layer, positional_embedding
268
+ )
269
+ attention_scores = (
270
+ attention_scores
271
+ + relative_position_scores_query
272
+ + relative_position_scores_key
273
+ )
274
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
275
+ if attention_mask is not None:
276
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
277
+ attention_scores = attention_scores + attention_mask
278
+
279
+ # Normalize the attention scores to probabilities.
280
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
281
+
282
+ if is_cross_attention and self.save_attention:
283
+ self.save_attention_map(attention_probs)
284
+ attention_probs.register_hook(self.save_attn_gradients)
285
+
286
+ # This is actually dropping out entire tokens to attend to, which might
287
+ # seem a bit unusual, but is taken from the original Transformer paper.
288
+ attention_probs_dropped = self.dropout(attention_probs)
289
+ # Mask heads if we want to
290
+ if head_mask is not None:
291
+ attention_probs_dropped = attention_probs_dropped * head_mask
292
+
293
+ context_layer = torch.matmul(attention_probs_dropped, value_layer) #[1, 12, 32, 64]
294
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
295
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
296
+ context_layer = context_layer.view(*new_context_layer_shape)
297
+ if is_cross_attention:
298
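+ # Convex blend of standard cross-attention and the long-term memory readout; alpha = 1.0 disables the memory path.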
+ context_layer = self.alpha*context_layer + (1-self.alpha)*a_long_term
299
+ outputs = (
300
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
301
+ )
302
+
303
+ outputs = outputs + (past_key_value,)
304
+ return outputs
305
+
306
+
307
+ class BertSelfOutput(nn.Module):
308
+ def __init__(self, config):
309
+ super().__init__()
310
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
311
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
312
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
313
+
314
+ def forward(self, hidden_states, input_tensor):
315
+ hidden_states = self.dense(hidden_states)
316
+ hidden_states = self.dropout(hidden_states)
317
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
318
+ return hidden_states
319
+
320
+
321
+ class BertAttention(nn.Module):
322
+ def __init__(self, config, is_cross_attention=False):
323
+ super().__init__()
324
+ self.self = BertSelfAttention(config, is_cross_attention)
325
+ self.output = BertSelfOutput(config)
326
+ self.pruned_heads = set()
327
+
328
+ def prune_heads(self, heads):
329
+ if len(heads) == 0:
330
+ return
331
+ heads, index = find_pruneable_heads_and_indices(
332
+ heads,
333
+ self.self.num_attention_heads,
334
+ self.self.attention_head_size,
335
+ self.pruned_heads,
336
+ )
337
+
338
+ # Prune linear layers
339
+ self.self.query = prune_linear_layer(self.self.query, index)
340
+ self.self.key = prune_linear_layer(self.self.key, index)
341
+ self.self.value = prune_linear_layer(self.self.value, index)
342
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
343
+
344
+ # Update hyper params and store pruned heads
345
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
346
+ self.self.all_head_size = (
347
+ self.self.attention_head_size * self.self.num_attention_heads
348
+ )
349
+ self.pruned_heads = self.pruned_heads.union(heads)
350
+
351
+ def forward(
352
+ self,
353
+ hidden_states,
354
+ position_embedding_ext,
355
+ layer,
356
+ attention_mask=None,
357
+ head_mask=None,
358
+ encoder_hidden_states=None,
359
+ encoder_attention_mask=None,
360
+ past_key_value=None,
361
+ output_attentions=False,
362
+ new_video=False,
363
+ ):
364
+ self_outputs = self.self(
365
+ hidden_states,
366
+ position_embedding_ext,
367
+ layer,
368
+ attention_mask,
369
+ head_mask,
370
+ encoder_hidden_states,
371
+ encoder_attention_mask,
372
+ past_key_value,
373
+ output_attentions,
374
+ new_video=new_video,
375
+ )
376
+ attention_output = self.output(self_outputs[0], hidden_states)
377
+
378
+ outputs = (attention_output,) + self_outputs[
379
+ 1:
380
+ ] # add attentions if we output them
381
+ return outputs
382
+
383
+
384
+ class BertIntermediate(nn.Module):
385
+ def __init__(self, config):
386
+ super().__init__()
387
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
388
+ if isinstance(config.hidden_act, str):
389
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
390
+ else:
391
+ self.intermediate_act_fn = config.hidden_act
392
+
393
+ def forward(self, hidden_states):
394
+ hidden_states = self.dense(hidden_states)
395
+ hidden_states = self.intermediate_act_fn(hidden_states)
396
+ return hidden_states
397
+
398
+
399
+ class BertOutput(nn.Module):
400
+ def __init__(self, config):
401
+ super().__init__()
402
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
403
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
404
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
405
+
406
+ def forward(self, hidden_states, input_tensor):
407
+ hidden_states = self.dense(hidden_states)
408
+ hidden_states = self.dropout(hidden_states)
409
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
410
+ return hidden_states
411
+
412
+
413
+ class BertLayer(nn.Module):
414
+ def __init__(self, config, layer_num):
415
+ super().__init__()
416
+ self.config = config
417
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
418
+ self.seq_len_dim = 1
419
+ self.attention = BertAttention(config)
420
+ self.layer_num = layer_num
421
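+ # Cross-attention into the encoder features is only inserted every `cross_attention_freq`-th layer.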
+ if (
422
+ self.config.add_cross_attention
423
+ and layer_num % self.config.cross_attention_freq == 0
424
+ ):
425
+ self.crossattention = BertAttention(
426
+ config, is_cross_attention=self.config.add_cross_attention
427
+ )
428
+ self.has_cross_attention = True
429
+ else:
430
+ self.has_cross_attention = False
431
+ self.intermediate = BertIntermediate(config)
432
+ self.output = BertOutput(config)
433
+
434
+ self.intermediate_query = BertIntermediate(config)
435
+ self.output_query = BertOutput(config)
436
+
437
+ def forward(
438
+ self,
439
+ hidden_states,
440
+ position_embedding_ext,
441
+ layer,
442
+ attention_mask=None,
443
+ head_mask=None,
444
+ encoder_hidden_states=None,
445
+ encoder_attention_mask=None,
446
+ past_key_value=None,
447
+ output_attentions=False,
448
+ query_length=0,
449
+ new_video=False,
450
+ ):
451
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
452
+ self_attn_past_key_value = (
453
+ past_key_value[:2] if past_key_value is not None else None
454
+ )
455
+ self_attention_outputs = self.attention(
456
+ hidden_states,
457
+ position_embedding_ext,
458
+ layer,
459
+ attention_mask,
460
+ head_mask,
461
+ output_attentions=output_attentions,
462
+ past_key_value=self_attn_past_key_value,
463
+ new_video=new_video,
464
+ )
465
+ attention_output = self_attention_outputs[0]
466
+ outputs = self_attention_outputs[1:-1]
467
+
468
+ present_key_value = self_attention_outputs[-1]
469
+
470
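+ # The first `query_length` positions are the learned query tokens: only they go through cross-attention and the query-specific FFN; any trailing text tokens use the standard FFN.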
+ if query_length > 0:
471
+ query_attention_output = attention_output[:, :query_length, :]
472
+
473
+ if self.has_cross_attention:
474
+ assert (
475
+ encoder_hidden_states is not None
476
+ ), "encoder_hidden_states must be given for cross-attention layers"
477
+ cross_attention_outputs = self.crossattention(
478
+ query_attention_output,
479
+ position_embedding_ext,
480
+ layer,
481
+ attention_mask,
482
+ head_mask,
483
+ encoder_hidden_states,
484
+ encoder_attention_mask,
485
+ output_attentions=output_attentions,
486
+ new_video=new_video,
487
+ )
488
+ query_attention_output = cross_attention_outputs[0]
489
+ outputs = (
490
+ outputs + cross_attention_outputs[1:-1]
491
+ ) # add cross attentions if we output attention weights
492
+
493
+ layer_output = apply_chunking_to_forward(
494
+ self.feed_forward_chunk_query,
495
+ self.chunk_size_feed_forward,
496
+ self.seq_len_dim,
497
+ query_attention_output,
498
+ )
499
+ if attention_output.shape[1] > query_length:
500
+ layer_output_text = apply_chunking_to_forward(
501
+ self.feed_forward_chunk,
502
+ self.chunk_size_feed_forward,
503
+ self.seq_len_dim,
504
+ attention_output[:, query_length:, :],
505
+ )
506
+ layer_output = torch.cat([layer_output, layer_output_text], dim=1)
507
+ else:
508
+ layer_output = apply_chunking_to_forward(
509
+ self.feed_forward_chunk,
510
+ self.chunk_size_feed_forward,
511
+ self.seq_len_dim,
512
+ attention_output,
513
+ )
514
+ outputs = (layer_output,) + outputs
515
+
516
+ outputs = outputs + (present_key_value,)
517
+
518
+ return outputs
519
+
520
+ def feed_forward_chunk(self, attention_output):
521
+ intermediate_output = self.intermediate(attention_output)
522
+ layer_output = self.output(intermediate_output, attention_output)
523
+ return layer_output
524
+
525
+ def feed_forward_chunk_query(self, attention_output):
526
+ intermediate_output = self.intermediate_query(attention_output)
527
+ layer_output = self.output_query(intermediate_output, attention_output)
528
+ return layer_output
529
+
530
+
531
+ class BertEncoder(nn.Module):
532
+ def __init__(self, config):
533
+ super().__init__()
534
+ self.config = config
535
+ self.layer = nn.ModuleList(
536
+ [BertLayer(config, i) for i in range(config.num_hidden_layers)]
537
+ )
538
+
539
+ def forward(
540
+ self,
541
+ hidden_states,
542
+ position_embedding_ext,
543
+ attention_mask=None,
544
+ head_mask=None,
545
+ encoder_hidden_states=None,
546
+ encoder_attention_mask=None,
547
+ past_key_values=None,
548
+ use_cache=None,
549
+ output_attentions=False,
550
+ output_hidden_states=False,
551
+ return_dict=True,
552
+ query_length=0,
553
+ new_video=False,
554
+ ):
555
+ all_hidden_states = () if output_hidden_states else None
556
+ all_self_attentions = () if output_attentions else None
557
+ all_cross_attentions = (
558
+ () if output_attentions and self.config.add_cross_attention else None
559
+ )
560
+
561
+ next_decoder_cache = () if use_cache else None
562
+
563
+ for i in range(self.config.num_hidden_layers):
564
+ layer_module = self.layer[i]
565
+ if output_hidden_states:
566
+ all_hidden_states = all_hidden_states + (hidden_states,)
567
+
568
+ layer_head_mask = head_mask[i] if head_mask is not None else None
569
+ past_key_value = past_key_values[i] if past_key_values is not None else None
570
+
571
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
572
+
573
+ if use_cache:
574
+ logger.warn(
575
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
576
+ )
577
+ use_cache = False
578
+
579
+ def create_custom_forward(module):
580
+ def custom_forward(*inputs):
581
+ return module(
582
+ *inputs, past_key_value, output_attentions, query_length
583
+ )
584
+
585
+ return custom_forward
586
+
587
+ layer_outputs = torch.utils.checkpoint.checkpoint(
588
+ create_custom_forward(layer_module),
589
+ hidden_states,
590
+ position_embedding_ext,
591
+ i,
592
+ attention_mask,
593
+ layer_head_mask,
594
+ encoder_hidden_states,
595
+ encoder_attention_mask,
596
+ new_video=new_video
597
+ )
598
+ else:
599
+ layer_outputs = layer_module(
600
+ hidden_states,
601
+ position_embedding_ext,
602
+ i,
603
+ attention_mask,
604
+ layer_head_mask,
605
+ encoder_hidden_states,
606
+ encoder_attention_mask,
607
+ past_key_value,
608
+ output_attentions,
609
+ query_length,
610
+ new_video=new_video,
611
+ )
612
+
613
+ hidden_states = layer_outputs[0]
614
+ if use_cache:
615
+ next_decoder_cache += (layer_outputs[-1],)
616
+ if output_attentions:
617
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
618
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
619
+
620
+ if output_hidden_states:
621
+ all_hidden_states = all_hidden_states + (hidden_states,)
622
+
623
+ if not return_dict:
624
+ return tuple(
625
+ v
626
+ for v in [
627
+ hidden_states,
628
+ next_decoder_cache,
629
+ all_hidden_states,
630
+ all_self_attentions,
631
+ all_cross_attentions,
632
+ ]
633
+ if v is not None
634
+ )
635
+ return BaseModelOutputWithPastAndCrossAttentions(
636
+ last_hidden_state=hidden_states,
637
+ past_key_values=next_decoder_cache,
638
+ hidden_states=all_hidden_states,
639
+ attentions=all_self_attentions,
640
+ cross_attentions=all_cross_attentions,
641
+ )
642
+
643
+
644
+ class BertPooler(nn.Module):
645
+ def __init__(self, config):
646
+ super().__init__()
647
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
648
+ self.activation = nn.Tanh()
649
+
650
+ def forward(self, hidden_states):
651
+ # We "pool" the model by simply taking the hidden state corresponding
652
+ # to the first token.
653
+ first_token_tensor = hidden_states[:, 0]
654
+ pooled_output = self.dense(first_token_tensor)
655
+ pooled_output = self.activation(pooled_output)
656
+ return pooled_output
657
+
658
+
659
+ class BertPredictionHeadTransform(nn.Module):
660
+ def __init__(self, config):
661
+ super().__init__()
662
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
663
+ if isinstance(config.hidden_act, str):
664
+ self.transform_act_fn = ACT2FN[config.hidden_act]
665
+ else:
666
+ self.transform_act_fn = config.hidden_act
667
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
668
+
669
+ def forward(self, hidden_states):
670
+ hidden_states = self.dense(hidden_states)
671
+ hidden_states = self.transform_act_fn(hidden_states)
672
+ hidden_states = self.LayerNorm(hidden_states)
673
+ return hidden_states
674
+
675
+
676
+ class BertLMPredictionHead(nn.Module):
677
+ def __init__(self, config):
678
+ super().__init__()
679
+ self.transform = BertPredictionHeadTransform(config)
680
+
681
+ # The output weights are the same as the input embeddings, but there is
682
+ # an output-only bias for each token.
683
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
684
+
685
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
686
+
687
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
688
+ self.decoder.bias = self.bias
689
+
690
+ def forward(self, hidden_states):
691
+ hidden_states = self.transform(hidden_states)
692
+ hidden_states = self.decoder(hidden_states)
693
+ return hidden_states
694
+
695
+
696
+ class BertOnlyMLMHead(nn.Module):
697
+ def __init__(self, config):
698
+ super().__init__()
699
+ self.predictions = BertLMPredictionHead(config)
700
+
701
+ def forward(self, sequence_output):
702
+ prediction_scores = self.predictions(sequence_output)
703
+ return prediction_scores
704
+
705
+
706
+ class BertPreTrainedModel(PreTrainedModel):
707
+ """
708
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
709
+ models.
710
+ """
711
+
712
+ config_class = BertConfig
713
+ base_model_prefix = "bert"
714
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
715
+
716
+ def _init_weights(self, module):
717
+ """Initialize the weights"""
718
+ if isinstance(module, (nn.Linear, nn.Embedding)):
719
+ # Slightly different from the TF version which uses truncated_normal for initialization
720
+ # cf https://github.com/pytorch/pytorch/pull/5617
721
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
722
+ elif isinstance(module, nn.LayerNorm):
723
+ module.bias.data.zero_()
724
+ module.weight.data.fill_(1.0)
725
+ if isinstance(module, nn.Linear) and module.bias is not None:
726
+ module.bias.data.zero_()
727
+
728
+
729
+ class BertModel(BertPreTrainedModel):
730
+ """
731
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
732
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
733
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
734
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
735
+ To be used in a Seq2Seq model, the model needs to be initialized with both the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
736
+ input to the forward pass.
737
+ """
738
+
739
+ def __init__(self, config, add_pooling_layer=False):
740
+ super().__init__(config)
741
+ self.config = config
742
+
743
+ self.embeddings = BertEmbeddings(config)
744
+
745
+ self.encoder = BertEncoder(config)
746
+
747
+ self.pooler = BertPooler(config) if add_pooling_layer else None
748
+
749
+ self.init_weights()
750
+
751
+ def get_input_embeddings(self):
752
+ return self.embeddings.word_embeddings
753
+
754
+ def set_input_embeddings(self, value):
755
+ self.embeddings.word_embeddings = value
756
+
757
+ def _prune_heads(self, heads_to_prune):
758
+ """
759
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
760
+ class PreTrainedModel
761
+ """
762
+ for layer, heads in heads_to_prune.items():
763
+ self.encoder.layer[layer].attention.prune_heads(heads)
764
+
765
+ def get_extended_attention_mask(
766
+ self,
767
+ attention_mask: Tensor,
768
+ input_shape: Tuple[int],
769
+ device: device,
770
+ is_decoder: bool,
771
+ has_query: bool = False,
772
+ ) -> Tensor:
773
+ """
774
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
775
+
776
+ Arguments:
777
+ attention_mask (:obj:`torch.Tensor`):
778
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
779
+ input_shape (:obj:`Tuple[int]`):
780
+ The shape of the input to the model.
781
+ device: (:obj:`torch.device`):
782
+ The device of the input to the model.
783
+
784
+ Returns:
785
+ :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
786
+ """
787
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
788
+ # ourselves in which case we just need to make it broadcastable to all heads.
789
+ if attention_mask.dim() == 3:
790
+ extended_attention_mask = attention_mask[:, None, :, :]
791
+ elif attention_mask.dim() == 2:
792
+ # Provided a padding mask of dimensions [batch_size, seq_length]
793
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
794
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
795
+ if is_decoder:
796
+ batch_size, seq_length = input_shape
797
+
798
+ seq_ids = torch.arange(seq_length, device=device)
799
+ causal_mask = (
800
+ seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
801
+ <= seq_ids[None, :, None]
802
+ )
803
+
804
+ # add a prefix ones mask to the causal mask
805
+ # causal and attention masks must have same type with pytorch version < 1.3
806
+ causal_mask = causal_mask.to(attention_mask.dtype)
807
+
808
+ if causal_mask.shape[1] < attention_mask.shape[1]:
809
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
810
+ if has_query: # UniLM style attention mask
811
+ causal_mask = torch.cat(
812
+ [
813
+ torch.zeros(
814
+ (batch_size, prefix_seq_len, seq_length),
815
+ device=device,
816
+ dtype=causal_mask.dtype,
817
+ ),
818
+ causal_mask,
819
+ ],
820
+ axis=1,
821
+ )
822
+ causal_mask = torch.cat(
823
+ [
824
+ torch.ones(
825
+ (batch_size, causal_mask.shape[1], prefix_seq_len),
826
+ device=device,
827
+ dtype=causal_mask.dtype,
828
+ ),
829
+ causal_mask,
830
+ ],
831
+ axis=-1,
832
+ )
833
+ extended_attention_mask = (
834
+ causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
835
+ )
836
+ else:
837
+ extended_attention_mask = attention_mask[:, None, None, :]
838
+ else:
839
+ raise ValueError(
840
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
841
+ input_shape, attention_mask.shape
842
+ )
843
+ )
844
+
845
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
846
+ # masked positions, this operation will create a tensor which is 0.0 for
847
+ # positions we want to attend and -10000.0 for masked positions.
848
+ # Since we are adding it to the raw scores before the softmax, this is
849
+ # effectively the same as removing these entirely.
850
+ extended_attention_mask = extended_attention_mask.to(
851
+ dtype=self.dtype
852
+ ) # fp16 compatibility
853
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
854
+ return extended_attention_mask
855
+
856
+ def forward(
857
+ self,
858
+ input_ids=None,
859
+ position_embedding_ext=None,
860
+ attention_mask=None,
861
+ position_ids=None,
862
+ head_mask=None,
863
+ query_embeds=None,
864
+ encoder_hidden_states=None,
865
+ encoder_attention_mask=None,
866
+ past_key_values=None,
867
+ use_cache=None,
868
+ output_attentions=None,
869
+ output_hidden_states=None,
870
+ return_dict=None,
871
+ is_decoder=False,
872
+ new_video=False,
873
+ ):
874
+ r"""
875
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
876
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
877
+ the model is configured as a decoder.
878
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
879
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
880
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
881
+ - 1 for tokens that are **not masked**,
882
+ - 0 for tokens that are **masked**.
883
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
884
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
885
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
886
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
887
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
888
+ use_cache (:obj:`bool`, `optional`):
889
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
890
+ decoding (see :obj:`past_key_values`).
891
+ """
892
+ output_attentions = (
893
+ output_attentions
894
+ if output_attentions is not None
895
+ else self.config.output_attentions
896
+ )
897
+ output_hidden_states = (
898
+ output_hidden_states
899
+ if output_hidden_states is not None
900
+ else self.config.output_hidden_states
901
+ )
902
+ return_dict = (
903
+ return_dict if return_dict is not None else self.config.use_return_dict
904
+ )
905
+
906
+ # use_cache = use_cache if use_cache is not None else self.config.use_cache
907
+
908
+ if input_ids is None:
909
+ assert (
910
+ query_embeds is not None
911
+ ), "You have to specify query_embeds when input_ids is None"
912
+
913
+ # past_key_values_length
914
+ past_key_values_length = (
915
+ past_key_values[0][0].shape[2] - self.config.query_length
916
+ if past_key_values is not None
917
+ else 0
918
+ )
919
+
920
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
921
+
922
+ embedding_output = self.embeddings(
923
+ input_ids=input_ids,
924
+ position_ids=position_ids,
925
+ query_embeds=query_embeds,
926
+ past_key_values_length=past_key_values_length,
927
+ )
928
+
929
+ input_shape = embedding_output.size()[:-1]
930
+ batch_size, seq_length = input_shape
931
+ device = embedding_output.device
932
+
933
+ if attention_mask is None:
934
+ attention_mask = torch.ones(
935
+ ((batch_size, seq_length + past_key_values_length)), device=device
936
+ )
937
+
938
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
939
+ # ourselves in which case we just need to make it broadcastable to all heads.
940
+ if is_decoder:
941
+ extended_attention_mask = self.get_extended_attention_mask(
942
+ attention_mask,
943
+ input_ids.shape,
944
+ device,
945
+ is_decoder,
946
+ has_query=(query_embeds is not None),
947
+ )
948
+ else:
949
+ extended_attention_mask = self.get_extended_attention_mask(
950
+ attention_mask, input_shape, device, is_decoder
951
+ )
952
+
953
+ # If a 2D or 3D attention mask is provided for the cross-attention
954
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
955
+ if encoder_hidden_states is not None:
956
+ if type(encoder_hidden_states) == list:
957
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
958
+ 0
959
+ ].size()
960
+ else:
961
+ (
962
+ encoder_batch_size,
963
+ encoder_sequence_length,
964
+ _,
965
+ ) = encoder_hidden_states.size()
966
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
967
+
968
+ if type(encoder_attention_mask) == list:
969
+ encoder_extended_attention_mask = [
970
+ self.invert_attention_mask(mask) for mask in encoder_attention_mask
971
+ ]
972
+ elif encoder_attention_mask is None:
973
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
974
+ encoder_extended_attention_mask = self.invert_attention_mask(
975
+ encoder_attention_mask
976
+ )
977
+ else:
978
+ encoder_extended_attention_mask = self.invert_attention_mask(
979
+ encoder_attention_mask
980
+ )
981
+ else:
982
+ encoder_extended_attention_mask = None
983
+
984
+ # Prepare head mask if needed
985
+ # 1.0 in head_mask indicate we keep the head
986
+ # attention_probs has shape bsz x n_heads x N x N
987
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
988
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
989
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
990
+
991
+ encoder_outputs = self.encoder(
992
+ embedding_output,
993
+ position_embedding_ext,
994
+ attention_mask=extended_attention_mask,
995
+ head_mask=head_mask,
996
+ encoder_hidden_states=encoder_hidden_states,
997
+ encoder_attention_mask=encoder_extended_attention_mask,
998
+ past_key_values=past_key_values,
999
+ use_cache=use_cache,
1000
+ output_attentions=output_attentions,
1001
+ output_hidden_states=output_hidden_states,
1002
+ return_dict=return_dict,
1003
+ query_length=query_length,
1004
+ new_video=new_video
1005
+ )
1006
+ sequence_output = encoder_outputs[0]
1007
+ pooled_output = (
1008
+ self.pooler(sequence_output) if self.pooler is not None else None
1009
+ )
1010
+
1011
+ if not return_dict:
1012
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1013
+
1014
+ return BaseModelOutputWithPoolingAndCrossAttentions(
1015
+ last_hidden_state=sequence_output,
1016
+ pooler_output=pooled_output,
1017
+ past_key_values=encoder_outputs.past_key_values,
1018
+ hidden_states=encoder_outputs.hidden_states,
1019
+ attentions=encoder_outputs.attentions,
1020
+ cross_attentions=encoder_outputs.cross_attentions,
1021
+ )
1022
+
1023
+
1024
+ class BertLMHeadModel(BertPreTrainedModel):
1025
+
1026
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1027
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
1028
+
1029
+ def __init__(self, config):
1030
+ super().__init__(config)
1031
+
1032
+ self.bert = BertModel(config, add_pooling_layer=False)
1033
+ self.cls = BertOnlyMLMHead(config)
1034
+
1035
+ self.init_weights()
1036
+
1037
+ def get_output_embeddings(self):
1038
+ return self.cls.predictions.decoder
1039
+
1040
+ def set_output_embeddings(self, new_embeddings):
1041
+ self.cls.predictions.decoder = new_embeddings
1042
+
1043
+ def forward(
1044
+ self,
1045
+ input_ids=None,
1046
+ attention_mask=None,
1047
+ position_ids=None,
1048
+ head_mask=None,
1049
+ query_embeds=None,
1050
+ encoder_hidden_states=None,
1051
+ encoder_attention_mask=None,
1052
+ labels=None,
1053
+ past_key_values=None,
1054
+ use_cache=True,
1055
+ output_attentions=None,
1056
+ output_hidden_states=None,
1057
+ return_dict=None,
1058
+ return_logits=False,
1059
+ is_decoder=True,
1060
+ reduction="mean",
1061
+ ):
1062
+ r"""
1063
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
1064
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1065
+ the model is configured as a decoder.
1066
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1067
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1068
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
1069
+ - 1 for tokens that are **not masked**,
1070
+ - 0 for tokens that are **masked**.
1071
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1072
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
1073
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
1074
+ ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1075
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1076
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1077
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
1078
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
1079
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
1080
+ use_cache (:obj:`bool`, `optional`):
1081
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
1082
+ decoding (see :obj:`past_key_values`).
1083
+ Returns:
1084
+ Example::
1085
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
1086
+ >>> import torch
1087
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
1088
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
1089
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
1090
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1091
+ >>> outputs = model(**inputs)
1092
+ >>> prediction_logits = outputs.logits
1093
+ """
1094
+ return_dict = (
1095
+ return_dict if return_dict is not None else self.config.use_return_dict
1096
+ )
1097
+ if labels is not None:
1098
+ use_cache = False
1099
+ if past_key_values is not None:
1100
+ query_embeds = None
1101
+
1102
+ outputs = self.bert(
1103
+ input_ids,
1104
+ attention_mask=attention_mask,
1105
+ position_ids=position_ids,
1106
+ head_mask=head_mask,
1107
+ query_embeds=query_embeds,
1108
+ encoder_hidden_states=encoder_hidden_states,
1109
+ encoder_attention_mask=encoder_attention_mask,
1110
+ past_key_values=past_key_values,
1111
+ use_cache=use_cache,
1112
+ output_attentions=output_attentions,
1113
+ output_hidden_states=output_hidden_states,
1114
+ return_dict=return_dict,
1115
+ is_decoder=is_decoder,
1116
+ )
1117
+
1118
+ sequence_output = outputs[0]
1119
+ if query_embeds is not None:
1120
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
1121
+
1122
+ prediction_scores = self.cls(sequence_output)
1123
+
1124
+ if return_logits:
1125
+ return prediction_scores[:, :-1, :].contiguous()
1126
+
1127
+ lm_loss = None
1128
+ if labels is not None:
1129
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1130
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
1131
+ labels = labels[:, 1:].contiguous()
1132
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
1133
+ lm_loss = loss_fct(
1134
+ shifted_prediction_scores.view(-1, self.config.vocab_size),
1135
+ labels.view(-1),
1136
+ )
1137
+ if reduction == "none":
1138
+ lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
1139
+
1140
+ if not return_dict:
1141
+ output = (prediction_scores,) + outputs[2:]
1142
+ return ((lm_loss,) + output) if lm_loss is not None else output
1143
+
1144
+ return CausalLMOutputWithCrossAttentions(
1145
+ loss=lm_loss,
1146
+ logits=prediction_scores,
1147
+ past_key_values=outputs.past_key_values,
1148
+ hidden_states=outputs.hidden_states,
1149
+ attentions=outputs.attentions,
1150
+ cross_attentions=outputs.cross_attentions,
1151
+ )
1152
+
1153
+ def prepare_inputs_for_generation(
1154
+ self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs
1155
+ ):
1156
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
1157
+ if attention_mask is None:
1158
+ attention_mask = input_ids.new_ones(input_ids.shape)
1159
+ query_mask = input_ids.new_ones(query_embeds.shape[:-1])
1160
+ attention_mask = torch.cat([query_mask, attention_mask], dim=-1)
1161
+
1162
+ # cut decoder_input_ids if past is used
1163
+ if past is not None:
1164
+ input_ids = input_ids[:, -1:]
1165
+
1166
+ return {
1167
+ "input_ids": input_ids,
1168
+ "query_embeds": query_embeds,
1169
+ "attention_mask": attention_mask,
1170
+ "past_key_values": past,
1171
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
1172
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
1173
+ "is_decoder": True,
1174
+ }
1175
+
1176
+ def _reorder_cache(self, past, beam_idx):
1177
+ reordered_past = ()
1178
+ for layer_past in past:
1179
+ reordered_past += (
1180
+ tuple(
1181
+ past_state.index_select(0, beam_idx) for past_state in layer_past
1182
+ ),
1183
+ )
1184
+ return reordered_past
1185
+
1186
+
1187
+ class BertForMaskedLM(BertPreTrainedModel):
1188
+
1189
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1190
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
1191
+
1192
+ def __init__(self, config):
1193
+ super().__init__(config)
1194
+
1195
+ self.bert = BertModel(config, add_pooling_layer=False)
1196
+ self.cls = BertOnlyMLMHead(config)
1197
+
1198
+ self.init_weights()
1199
+
1200
+ def get_output_embeddings(self):
1201
+ return self.cls.predictions.decoder
1202
+
1203
+ def set_output_embeddings(self, new_embeddings):
1204
+ self.cls.predictions.decoder = new_embeddings
1205
+
1206
+ def forward(
1207
+ self,
1208
+ input_ids=None,
1209
+ attention_mask=None,
1210
+ position_ids=None,
1211
+ head_mask=None,
1212
+ query_embeds=None,
1213
+ encoder_hidden_states=None,
1214
+ encoder_attention_mask=None,
1215
+ labels=None,
1216
+ output_attentions=None,
1217
+ output_hidden_states=None,
1218
+ return_dict=None,
1219
+ return_logits=False,
1220
+ is_decoder=False,
1221
+ ):
1222
+ r"""
1223
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1224
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
1225
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
1226
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1227
+ """
1228
+
1229
+ return_dict = (
1230
+ return_dict if return_dict is not None else self.config.use_return_dict
1231
+ )
1232
+
1233
+ outputs = self.bert(
1234
+ input_ids,
1235
+ attention_mask=attention_mask,
1236
+ position_ids=position_ids,
1237
+ head_mask=head_mask,
1238
+ query_embeds=query_embeds,
1239
+ encoder_hidden_states=encoder_hidden_states,
1240
+ encoder_attention_mask=encoder_attention_mask,
1241
+ output_attentions=output_attentions,
1242
+ output_hidden_states=output_hidden_states,
1243
+ return_dict=return_dict,
1244
+ is_decoder=is_decoder,
1245
+ )
1246
+
1247
+ if query_embeds is not None:
1248
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
1249
+ prediction_scores = self.cls(sequence_output)
1250
+
1251
+ if return_logits:
1252
+ return prediction_scores
1253
+
1254
+ masked_lm_loss = None
1255
+ if labels is not None:
1256
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1257
+ masked_lm_loss = loss_fct(
1258
+ prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
1259
+ )
1260
+
1261
+ if not return_dict:
1262
+ output = (prediction_scores,) + outputs[2:]
1263
+ return (
1264
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1265
+ )
1266
+
1267
+ return MaskedLMOutput(
1268
+ loss=masked_lm_loss,
1269
+ logits=prediction_scores,
1270
+ hidden_states=outputs.hidden_states,
1271
+ attentions=outputs.attentions,
1272
+ )
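
The Q-Former above is a BERT encoder/decoder extended with cross-attention into a visual encoder plus an auxiliary long-term-memory attention path, so its config needs several attributes that a stock BertConfig does not carry (encoder_width, cross_attention_freq, query_length, alpha, num_basis, sticky, tau). A minimal wiring sketch; the numeric values are illustrative assumptions, not values taken from this repo:

import torch
from transformers import BertConfig
# BertLMHeadModel is the class defined in Qformer.py above

cfg = BertConfig.from_pretrained("bert-base-uncased")
cfg.encoder_width = 1024        # width of the visual encoder features
cfg.add_cross_attention = True
cfg.cross_attention_freq = 2    # cross-attention every other layer
cfg.query_length = 32           # number of learned query tokens
cfg.alpha = 0.9                 # weight of standard cross-attention vs. long-term memory
cfg.num_basis = 64              # basis functions of the continuous memory (assumed)
cfg.sticky = False              # sticky memories off (assumed)
cfg.tau = 0.5                   # memory temperature (assumed)

qformer = BertLMHeadModel(config=cfg)
query_tokens = torch.zeros(1, cfg.query_length, cfg.hidden_size)
visual_feats = torch.randn(1, 256, cfg.encoder_width)
out = qformer.bert(query_embeds=query_tokens, encoder_hidden_states=visual_feats, return_dict=True)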
__init__.py ADDED
File without changes
blip2.py ADDED
@@ -0,0 +1,105 @@
1
+ """
2
+ Copyright (c) 2023, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+ import contextlib
8
+ import os
9
+ import logging
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from .Qformer import BertConfig, BertLMHeadModel
15
+ from .vit import build_vit
16
+ from transformers import BertTokenizer
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ from transformers import PreTrainedModel, PretrainedConfig, AutoConfig
21
+ # class Blip2Base(nn.Module):
22
+ class Blip2Base(PreTrainedModel):
23
+ def __init__(self, config={}):
24
+ cfg = PretrainedConfig()
25
+ if isinstance(config, (PretrainedConfig, AutoConfig)):
26
+ cfg.update(config.to_dict())
27
+ else:
28
+ cfg.update(dict(config))
29
+ super().__init__(cfg)
30
+
31
+ @classmethod
32
+ def init_tokenizer(cls, truncation_side="right"):
33
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side=truncation_side, local_files_only=True)
34
+ tokenizer.add_special_tokens({"bos_token": "[DEC]"})
35
+ return tokenizer
36
+
37
+ @property
38
+ def device(self):
39
+ return list(self.parameters())[0].device
40
+
41
+ def maybe_autocast(self, dtype=torch.float16):
42
+ # if on cpu, don't use autocast
43
+ # if on gpu, use autocast with dtype if provided, otherwise use torch.float16
44
+ enable_autocast = self.device != torch.device("cpu")
45
+
46
+ if enable_autocast:
47
+ return torch.cuda.amp.autocast(dtype=dtype)
48
+ else:
49
+ return contextlib.nullcontext()
50
+
51
+ @classmethod
52
+ def init_Qformer(
53
+ cls,
54
+ num_query_token, vision_width,
55
+ qformer_hidden_dropout_prob=0.1,
56
+ qformer_attention_probs_dropout_prob=0.1,
57
+ qformer_drop_path_rate=0.,
58
+ ):
59
+ encoder_config = BertConfig.from_pretrained("bert-base-uncased", local_files_only=True)
60
+ encoder_config.encoder_width = vision_width
61
+ # insert cross-attention layer every other block
62
+ encoder_config.add_cross_attention = True
63
+ encoder_config.cross_attention_freq = 2
64
+ encoder_config.query_length = num_query_token
65
+ encoder_config.hidden_dropout_prob = qformer_hidden_dropout_prob
66
+ encoder_config.attention_probs_dropout_prob = qformer_attention_probs_dropout_prob
67
+ encoder_config.drop_path_list = [x.item() for x in torch.linspace(0, qformer_drop_path_rate, encoder_config.num_hidden_layers)]
68
+ logger.info(f"Drop_path:{encoder_config.drop_path_list}")
69
+ logger.info(encoder_config)
70
+ Qformer = BertLMHeadModel(config=encoder_config)
71
+ query_tokens = nn.Parameter(
72
+ torch.zeros(1, num_query_token, encoder_config.hidden_size)
73
+ )
74
+ query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
75
+ return Qformer, query_tokens
76
+
77
+ @classmethod
78
+ def init_vision_encoder_umt(cls, config):
79
+ """build vision encoder
80
+ Returns: (vision_encoder, vision_layernorm). Each is a `nn.Module`.
81
+
82
+ """
83
+ vision_encoder = build_vit(config)
84
+
85
+ if config.vision_encoder.vit_add_ln:
86
+ vision_layernorm = nn.LayerNorm(config.vision_encoder.encoder_embed_dim, eps=1e-12)
87
+ else:
88
+ vision_layernorm = nn.Identity()
89
+
90
+ return vision_encoder, vision_layernorm
91
+
92
+
93
+ def disabled_train(self, mode=True):
94
+ """Overwrite model.train with this function to make sure train/eval mode
95
+ does not change anymore."""
96
+ return self
97
+
98
+
99
+ class LayerNorm(nn.LayerNorm):
100
+ """Subclass torch's LayerNorm to handle fp16."""
101
+
102
+ def forward(self, x: torch.Tensor):
103
+ orig_type = x.dtype
104
+ ret = super().forward(x.type(torch.float32))
105
+ return ret.type(orig_type)
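
A short sketch of how the two class helpers above are typically combined; the query count and vision width mirror the config.json that follows rather than anything hard-coded in this file:

tokenizer = Blip2Base.init_tokenizer()   # BERT tokenizer with "[DEC]" added as bos_token
qformer, query_tokens = Blip2Base.init_Qformer(
    num_query_token=32,                  # config.json: num_query_token
    vision_width=1024,                   # config.json: vision_encoder.encoder_embed_dim
    qformer_hidden_dropout_prob=0.1,
    qformer_attention_probs_dropout_prob=0.1,
    qformer_drop_path_rate=0.2,
)
# query_tokens is an nn.Parameter of shape (1, num_query_token, hidden_size),
# expanded along the batch dimension at run time.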
config.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "add_second_msg": true,
3
+ "architectures": [
4
+ "VideoChat2_it_hd_mistral"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_videochat2.Config",
8
+ "AutoModel": "videochat2_it_hd_mistral.VideoChat2_it_hd_mistral"
9
+ },
10
+ "dynamic_config": {
11
+ "add_global": true,
12
+ "hd_num": 6,
13
+ "local_size": 224,
14
+ "padding": false
15
+ },
16
+ "end_token": "</Video>",
17
+ "extra_num_query_token": 64,
18
+ "freeze_qformer": false,
19
+ "freeze_vit": false,
20
+ "img_end_token": "</Image>",
21
+ "img_start_token": "<Image>",
22
+ "lora_alpha": 32,
23
+ "lora_dropout": 0.1,
24
+ "lora_r": 16,
25
+ "low_resource": false,
26
+ "max_txt_len": 512,
27
+ "mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2",
28
+ "model_cls": "VideoChat2_it_hd_mistral",
29
+ "num_query_token": 32,
30
+ "qformer_attention_probs_dropout_prob": 0.1,
31
+ "qformer_drop_path_rate": 0.2,
32
+ "qformer_hidden_dropout_prob": 0.1,
33
+ "qformer_text_input": true,
34
+ "random_shuffle": true,
35
+ "return_question_instruction": false,
36
+ "start_token": "<Video>",
37
+ "system": "",
38
+ "torch_dtype": "float32",
39
+ "transformers_version": "4.44.2",
40
+ "use_flash_attention": false,
41
+ "use_lora": false,
42
+ "videochat2_model_path": "",
43
+ "vision_encoder": {
44
+ "checkpoint_num": 18,
45
+ "ckpt_num_frame": 4,
46
+ "d_model": 1024,
47
+ "drop_path_rate": 0.0,
48
+ "encoder_depth": 24,
49
+ "encoder_embed_dim": 1024,
50
+ "encoder_num_heads": 16,
51
+ "img_size": 224,
52
+ "name": "vit_l14",
53
+ "num_frames": 4,
54
+ "patch_size": 16,
55
+ "pretrained": "",
56
+ "return_index": -2,
57
+ "tubelet_size": 1,
58
+ "use_checkpoint": true,
59
+ "vit_add_ln": true
60
+ },
61
+ "vit_blip_model_path": ""
62
+ }
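
Because auto_map routes AutoConfig and AutoModel to the bundled configuration_videochat2.Config and videochat2_it_hd_mistral.VideoChat2_it_hd_mistral, the checkpoint loads through transformers' remote-code path. A minimal sketch, with the repo id left as a placeholder:

from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("<repo-id-or-local-path>", trust_remote_code=True)
model = AutoModel.from_pretrained("<repo-id-or-local-path>", config=config, trust_remote_code=True)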
configuration_videochat2.py ADDED
@@ -0,0 +1,453 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import ast
5
+ import json
6
+ import os
7
+ import os.path as osp
8
+ import re
9
+ import shutil
10
+ import sys
11
+ import tempfile
12
+ from copy import deepcopy
13
+ from importlib import import_module
14
+
15
+ import yaml
16
+
17
+
18
+ __all__ = ["Config", "pretty_text"]
19
+
20
+
21
+ BASE_KEY = "_base_"
22
+ # BASE_CONFIG = {"OUTPUT_DIR": "./workspace", "SESSION": "base", "LOG_FILE": "log.txt"}
23
+ BASE_CONFIG = {}
24
+
25
+ cfg = None
26
+
27
+ class EasyDict(dict):
28
+ """
29
+ Get attributes
30
+
31
+ >>> d = EasyDict({'foo':3})
32
+ >>> d['foo']
33
+ 3
34
+ >>> d.foo
35
+ 3
36
+ >>> d.bar
37
+ Traceback (most recent call last):
38
+ ...
39
+ AttributeError: 'EasyDict' object has no attribute 'bar'
40
+
41
+ Works recursively
42
+
43
+ >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}})
44
+ >>> isinstance(d.bar, dict)
45
+ True
46
+ >>> d.bar.x
47
+ 1
48
+
49
+ Bullet-proof
50
+
51
+ >>> EasyDict({})
52
+ {}
53
+ >>> EasyDict(d={})
54
+ {}
55
+ >>> EasyDict(None)
56
+ {}
57
+ >>> d = {'a': 1}
58
+ >>> EasyDict(**d)
59
+ {'a': 1}
60
+
61
+ Set attributes
62
+
63
+ >>> d = EasyDict()
64
+ >>> d.foo = 3
65
+ >>> d.foo
66
+ 3
67
+ >>> d.bar = {'prop': 'value'}
68
+ >>> d.bar.prop
69
+ 'value'
70
+ >>> d
71
+ {'foo': 3, 'bar': {'prop': 'value'}}
72
+ >>> d.bar.prop = 'newer'
73
+ >>> d.bar.prop
74
+ 'newer'
75
+
76
+
77
+ Values extraction
78
+
79
+ >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]})
80
+ >>> isinstance(d.bar, list)
81
+ True
82
+ >>> from operator import attrgetter
83
+ >>> map(attrgetter('x'), d.bar)
84
+ [1, 3]
85
+ >>> map(attrgetter('y'), d.bar)
86
+ [2, 4]
87
+ >>> d = EasyDict()
88
+ >>> d.keys()
89
+ []
90
+ >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
91
+ >>> d.foo
92
+ 3
93
+ >>> d.bar.x
94
+ 1
95
+
96
+ Still like a dict though
97
+
98
+ >>> o = EasyDict({'clean':True})
99
+ >>> o.items()
100
+ [('clean', True)]
101
+
102
+ And like a class
103
+
104
+ >>> class Flower(EasyDict):
105
+ ... power = 1
106
+ ...
107
+ >>> f = Flower()
108
+ >>> f.power
109
+ 1
110
+ >>> f = Flower({'height': 12})
111
+ >>> f.height
112
+ 12
113
+ >>> f['power']
114
+ 1
115
+ >>> sorted(f.keys())
116
+ ['height', 'power']
117
+
118
+ update and pop items
119
+ >>> d = EasyDict(a=1, b='2')
120
+ >>> e = EasyDict(c=3.0, a=9.0)
121
+ >>> d.update(e)
122
+ >>> d.c
123
+ 3.0
124
+ >>> d['c']
125
+ 3.0
126
+ >>> d.get('c')
127
+ 3.0
128
+ >>> d.update(a=4, b=4)
129
+ >>> d.b
130
+ 4
131
+ >>> d.pop('a')
132
+ 4
133
+ >>> d.a
134
+ Traceback (most recent call last):
135
+ ...
136
+ AttributeError: 'EasyDict' object has no attribute 'a'
137
+ """
138
+
139
+ def __init__(self, d=None, **kwargs):
140
+ if d is None:
141
+ d = {}
142
+ if kwargs:
143
+ d.update(**kwargs)
144
+ for k, v in d.items():
145
+ setattr(self, k, v)
146
+ # Class attributes
147
+ for k in self.__class__.__dict__.keys():
148
+ if not (k.startswith("__") and k.endswith("__")) and not k in ("update", "pop"):
149
+ setattr(self, k, getattr(self, k))
150
+
151
+ def __setattr__(self, name, value):
152
+ if isinstance(value, (list, tuple)):
153
+ value = [self.__class__(x) if isinstance(x, dict) else x for x in value]
154
+ elif isinstance(value, dict) and not isinstance(value, self.__class__):
155
+ value = self.__class__(value)
156
+ super(EasyDict, self).__setattr__(name, value)
157
+ super(EasyDict, self).__setitem__(name, value)
158
+
159
+ __setitem__ = __setattr__
160
+
161
+ def update(self, e=None, **f):
162
+ d = e or dict()
163
+ d.update(f)
164
+ for k in d:
165
+ setattr(self, k, d[k])
166
+
167
+ def pop(self, k, d=None):
168
+ if hasattr(self, k):
169
+ delattr(self, k)
170
+ return super(EasyDict, self).pop(k, d)
171
+
172
+ from transformers import PretrainedConfig
173
+ class Config(PretrainedConfig):
174
+ _auto_class = "AutoConfig"
175
+ """config"""
176
+ def __init__(self, **kwargs):
177
+ super().__init__(**kwargs)
178
+ self.cfg=EasyDict(kwargs)
179
+
180
+ @classmethod
181
+ def pretty_text(cls, cfg: dict, indent=2) -> str:
182
+ """format dict to a string
183
+
184
+ Args:
185
+ cfg (EasyDict): the params.
186
+
187
+ Returns: The string to display.
188
+
189
+ """
190
+ msg = "{\n"
191
+ for i, (k, v) in enumerate(cfg.items()):
192
+ if isinstance(v, dict):
193
+ v = cls.pretty_text(v, indent + 4)
194
+ spaces = " " * indent
195
+ msg += spaces + "{}: {}".format(k, v)
196
+ if i == len(cfg) - 1:
197
+ msg += " }"
198
+ else:
199
+ msg += "\n"
200
+ return msg
201
+
202
+ @classmethod
203
+ def dump(cls, cfg, savepath=None):
204
+ """dump cfg to `json` file.
205
+
206
+ Args:
207
+ cfg (dict): The dict to dump.
208
+ savepath (str): The filepath to save the dumped dict.
209
+
210
+ Returns: TODO
211
+
212
+ """
213
+ if savepath is None:
214
+ savepath = osp.join(cfg.WORKSPACE, "config.json")
215
+ json.dump(cfg, open(savepath, "w"), indent=2)
216
+
217
+ @classmethod
218
+ def get_config(cls, default_config: dict = None, config_file: str=''):
219
+ """get a `Config` instance.
220
+
221
+ Args:
222
+ default_config (dict): The default config. `default_config` will be overridden
223
+ by the config file `--cfg`; `--cfg` will be overridden by command-line args.
224
+
225
+ Returns: an EasyDict.
226
+ """
227
+ global cfg
228
+ if cfg is not None:
229
+ return cfg
230
+
231
+ # define arg parser.
232
+ parser = argparse.ArgumentParser()
233
+ # parser.add_argument("--cfg", help="load configs from yaml file", default="", type=str)
234
+ parser.add_argument(
235
+ "--config_file", default='your config file', help="the configuration file to load. support: .yaml, .json, .py"
236
+ )
237
+ parser.add_argument(
238
+ "--opts",
239
+ default=None,
240
+ nargs="*",
241
+ help="overridden configs. List. Format: 'key1 value1 key2 value2'",
242
+ )
243
+ # args = parser.parse_args()
244
+ args = parser.parse_known_args()[0] # parse_known_args so this also works inside Jupyter notebooks
245
+ args.config_file="/mnt/petrelfs/shiyansong/WEIGHT/UMT/l16_25m.py"
246
+
247
+ if config_file:
248
+ args.config_file=config_file
249
+
250
+ cfg = EasyDict(BASE_CONFIG)
251
+ # if default_config: # new------------------------------------
252
+ # cfg = merge_a_into_b(default_config, cfg)
253
+ if osp.isfile(args.config_file):
254
+ cfg_from_file = cls.from_file(args.config_file)
255
+ cfg = merge_a_into_b(cfg_from_file, cfg)
256
+ if args.opts:
257
+ cfg = cls.merge_list(cfg, args.opts)
258
+ cfg = eval_dict_leaf(cfg)
259
+
260
+ # update some keys to make them show at the last
261
+ for k in BASE_CONFIG:
262
+ cfg[k] = cfg.pop(k)
263
+ return cfg
264
+
265
+ @classmethod
266
+ def from_file(cls, filepath: str) -> EasyDict:
267
+ """Build config from file. Supported filetypes: `.py`,`.yaml`,`.json`.
268
+
269
+ Args:
270
+ filepath (str): The config file path.
271
+
272
+ Returns: TODO
273
+
274
+ """
275
+ filepath = osp.abspath(osp.expanduser(filepath))
276
+ if not osp.isfile(filepath):
277
+ raise IOError(f"File does not exist: {filepath}")
278
+ if filepath.endswith(".py"):
279
+ sys.path.insert(0, osp.dirname(filepath))
280
+ mod = import_module(osp.splitext(osp.basename(filepath))[0])
281
+ cfg_dict = {
282
+ name: value
283
+ for name, value in mod.__dict__.items()
284
+ if not name.startswith("__")
285
+ }
286
+
287
+ # The temporary-directory import path below is not needed; it is kept commented out for reference.
288
+ # with tempfile.TemporaryDirectory() as temp_config_dir:
289
+ # print(temp_config_dir, filepath)
290
+
291
+ # print(f"Copying {osp.dirname(filepath)} to {osp.join(temp_config_dir, 'tmp_config')}")
292
+ # shutil.copytree(osp.dirname(filepath), osp.join(temp_config_dir, "tmp_config"))
293
+ # sys.path.insert(0, temp_config_dir)
294
+ # mod = import_module("tmp_config." + osp.splitext(osp.basename(filepath))[0])
295
+ # # mod = import_module(temp_module_name)
296
+ # sys.path.pop(0)
297
+ # cfg_dict = {
298
+ # name: value
299
+ # for name, value in mod.__dict__.items()
300
+ # if not name.startswith("__")
301
+ # }
302
+ # print("Removing")
303
+ # for k in list(sys.modules.keys()):
304
+ # if "tmp_config" in k:
305
+ # del sys.modules[k]
306
+ elif filepath.endswith((".yml", ".yaml")):
307
+ cfg_dict = yaml.load(open(filepath, "r"), Loader=yaml.Loader)
308
+ elif filepath.endswith(".json"):
309
+ cfg_dict = json.load(open(filepath, "r"))
310
+ else:
311
+ raise IOError("Only .py/.yml/.yaml/.json config files are supported.")
312
+
313
+ cfg_text = filepath + "\n"
314
+ with open(filepath, "r") as f:
315
+ cfg_text += f.read()
316
+
317
+ if BASE_KEY in cfg_dict: # load configs in `BASE_KEY`
318
+ cfg_dir = osp.dirname(filepath)
319
+ base_filename = cfg_dict.pop(BASE_KEY)
320
+ base_filename = (
321
+ base_filename if isinstance(base_filename, list) else [base_filename]
322
+ )
323
+
324
+ cfg_dict_list = list()
325
+ for f in base_filename:
326
+ _cfg_dict = Config.from_file(osp.join(cfg_dir, f))
327
+ cfg_dict_list.append(_cfg_dict)
328
+
329
+ base_cfg_dict = dict()
330
+ for c in cfg_dict_list:
331
+ if len(base_cfg_dict.keys() & c.keys()) > 0:
332
+ raise KeyError("Duplicate key is not allowed among bases")
333
+ base_cfg_dict.update(c)
334
+
335
+ cfg_dict = merge_a_into_b(cfg_dict, base_cfg_dict)
336
+
337
+ return EasyDict(cfg_dict)
338
+
339
+ @classmethod
340
+ def merge_list(cls, cfg, opts: list):
341
+ """merge commandline opts.
342
+
343
+ Args:
344
+ cfg: (dict): The config to be merged.
345
+ opts (list): The list to merge. Format: [key1, name1, key2, name2,...].
346
+ The keys can be nested. For example, ["a.b", v] will be considered
347
+ as `dict(a=dict(b=v))`.
348
+
349
+ Returns: dict.
350
+
351
+ """
352
+ assert len(opts) % 2 == 0, f"length of opts must be even. Got: {opts}"
353
+ for i in range(0, len(opts), 2):
354
+ full_k, v = opts[i], opts[i + 1]
355
+ keys = full_k.split(".")
356
+ sub_d = cfg
357
+ for i, k in enumerate(keys):
358
+ if not hasattr(sub_d, k):
359
+ raise ValueError(f"The key {k} does not exist in the config. Full key: {full_k}")
360
+ if i != len(keys) - 1:
361
+ sub_d = sub_d[k]
362
+ else:
363
+ sub_d[k] = v
364
+ return cfg
365
+
366
+
367
+ def merge_a_into_b(a, b, inplace=False):
368
+ """The values in a will override values in b.
369
+
370
+ Args:
371
+ a (dict): source dict.
372
+ b (dict): target dict.
373
+
374
+ Returns: dict. recursively merge dict a into dict b.
375
+
376
+ """
377
+ if not inplace:
378
+ b = deepcopy(b)
379
+ for key in a:
380
+ if key in b:
381
+ if isinstance(a[key], dict) and isinstance(b[key], dict):
382
+ b[key] = merge_a_into_b(a[key], b[key], inplace=True)
383
+ else:
384
+ b[key] = a[key]
385
+ else:
386
+ b[key] = a[key]
387
+ return b
388
+
389
+
390
+ def eval_dict_leaf(d, orig_dict=None):
391
+ """eval values of dict leaf.
392
+
393
+ Args:
394
+ d (dict): The dict to eval.
395
+
396
+ Returns: dict.
397
+
398
+ """
399
+ if orig_dict is None:
400
+ orig_dict = d
401
+ for k, v in d.items():
402
+ if not isinstance(v, dict):
403
+ d[k] = eval_string(v, orig_dict)
404
+ else:
405
+ eval_dict_leaf(v, orig_dict)
406
+ return d
407
+
408
+
409
+ def eval_string(string, d):
410
+ """automatically evaluate string to corresponding types.
411
+
412
+ For example:
413
+ not a string -> return the original input
414
+ '0' -> 0
415
+ '0.2' -> 0.2
416
+ '[0, 1, 2]' -> [0,1,2]
417
+ 'eval(1+2)' -> 3
418
+ 'eval(range(5))' -> [0,1,2,3,4]
419
+ '${a}' -> d.a
420
+
421
+
422
+
423
+ Args:
424
+ string (str): The value to evaluate.
425
+ d (dict): The dict used to resolve `${...}` references.
426
+
427
+ Returns: the corresponding type
428
+
429
+ """
430
+ if not isinstance(string, str):
431
+ return string
432
+ # if len(string) > 1 and string[0] == "[" and string[-1] == "]":
433
+ # return eval(string)
434
+ if string[0:5] == "eval(":
435
+ return eval(string[5:-1])
436
+
437
+ s0 = string
438
+ s1 = re.sub(r"\${(.*)}", r"d.\1", s0)
439
+ if s1 != s0:
440
+ while s1 != s0:
441
+ s0 = s1
442
+ s1 = re.sub(r"\${(.*)}", r"d.\1", s0)
443
+ return eval(s1)
444
+
445
+ try:
446
+ v = ast.literal_eval(string)
447
+ except Exception:
448
+ v = string
449
+ return v
450
+
451
+ if __name__=="__main__":
452
+ d=EasyDict({"1":2,"2":3})
453
+ cfg=Config({"1":2,"2":3})
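A small usage sketch for the helpers above (assumption: "my_config.json" is a placeholder for a local config file, not one shipped in this repo):

from configuration_videochat2 import Config, eval_dict_leaf

cfg = Config.from_file("my_config.json")   # returns an EasyDict
cfg = eval_dict_leaf(cfg)                  # resolves 'eval(...)' and '${...}' leaves
print(Config.pretty_text(cfg))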
ltm/basis_functions.py ADDED
@@ -0,0 +1,266 @@
1
+ import torch
2
+ import math
3
+
4
+
5
+ class BasisFunctions(object):
6
+ def __init__(self):
7
+ pass
8
+
9
+ def __len__(self):
10
+ """Number of basis functions."""
11
+ pass
12
+
13
+ def evaluate(self, t):
14
+ pass
15
+
16
+ def integrate_t2_times_psi(self, a, b):
17
+ """Compute integral int_a^b (t**2) * psi(t)."""
18
+ pass
19
+
20
+ def integrate_t_times_psi(self, a, b):
21
+ """Compute integral int_a^b t * psi(t)."""
22
+ pass
23
+
24
+ def integrate_psi(self, a, b):
25
+ """Compute integral int_a^b psi(t)."""
26
+ pass
27
+
28
+
29
+ class PowerBasisFunctions(BasisFunctions):
30
+ """Function phi(t) = t**degree."""
31
+ def __init__(self, degree):
32
+ self.degree = degree.unsqueeze(0)
33
+
34
+ def __len__(self):
35
+ """Number of basis functions."""
36
+ return self.degree.size(1)
37
+
38
+ def evaluate(self, t):
39
+ return t**self.degree
40
+
41
+ def integrate_t2_times_psi(self, a, b):
42
+ """Compute integral int_a^b (t**2) * psi(t)."""
43
+ return (b**(self.degree + 3) - a**(self.degree + 3)) / (self.degree + 3)
44
+
45
+ def integrate_t_times_psi(self, a, b):
46
+ """Compute integral int_a^b t * psi(t)."""
47
+ return (b**(self.degree + 2) - a**(self.degree + 2)) / (self.degree + 2)
48
+
49
+ def integrate_psi(self, a, b):
50
+ """Compute integral int_a^b psi(t)."""
51
+ return (b**(self.degree + 1) - a**(self.degree + 1)) / (self.degree + 1)
52
+
53
+ def __repr__(self):
54
+ return f"PowerBasisFunction(degree={self.degree})"
55
+
56
+
57
+ class SineBasisFunctions(BasisFunctions):
58
+ """Function phi(t) = sin(omega*t)."""
59
+ def __init__(self, omega):
60
+ self.omega = omega.unsqueeze(0)
61
+
62
+ def __repr__(self):
63
+ return f"SineBasisFunction(omega={self.omega})"
64
+
65
+ def __len__(self):
66
+ """Number of basis functions."""
67
+ return self.omega.size(1)
68
+
69
+ def evaluate(self, t):
70
+ return torch.sin(self.omega*t)
71
+
72
+ def integrate_t2_times_psi(self, a, b):
73
+ """Compute integral int_a^b (t**2) * psi(t)."""
74
+ # The antiderivative of (t**2)*sin(omega*t) is
75
+ # ((2-(t**2)*(omega**2))*cos(omega*t) + 2*omega*t*sin(omega*t)) / omega**3. # noqa
76
+ return ((2-(b**2)*(self.omega**2))*torch.cos(self.omega*b)
77
+ + 2*self.omega*b*torch.sin(self.omega*b)
78
+ - (2-(a**2)*(self.omega**2))*torch.cos(self.omega*a)
79
+ - 2*self.omega*a*torch.sin(self.omega*a)
80
+ ) / (self.omega**3)
81
+
82
+ def integrate_t_times_psi(self, a, b):
83
+ """Compute integral int_a^b t * psi(t)."""
84
+ # The antiderivative of t*sin(omega*t) is
85
+ # (sin(omega*t) - omega*t*cos(omega*t)) / omega**2.
86
+ return (torch.sin(self.omega*b) - self.omega*b*torch.cos(self.omega*b)
87
+ - torch.sin(self.omega*a) + self.omega*a*torch.cos(self.omega*a)
88
+ ) / (self.omega**2)
89
+
90
+ def integrate_psi(self, a, b):
91
+ """Compute integral int_a^b psi(t)."""
92
+ # The antiderivative of sin(omega*t) is -cos(omega*t)/omega.
93
+ return (-torch.cos(self.omega*b) + torch.cos(self.omega*a)) / self.omega
94
+
95
+
96
+ class CosineBasisFunctions(BasisFunctions):
97
+ """Function phi(t) = cos(omega*t)."""
98
+ def __init__(self, omega):
99
+ self.omega = omega.unsqueeze(0)
100
+
101
+ def __repr__(self):
102
+ return f"CosineBasisFunction(omega={self.omega})"
103
+
104
+ def __len__(self):
105
+ """Number of basis functions."""
106
+ return self.omega.size(1)
107
+
108
+ def evaluate(self, t):
109
+ return torch.cos(self.omega*t)
110
+
111
+ def integrate_t2_times_psi(self, a, b):
112
+ """Compute integral int_a^b (t**2) * psi(t)."""
113
+ # The antiderivative of (t**2)*cos(omega*t) is
114
+ # (((t**2)*(omega**2)-2)*sin(omega*t) + 2*omega*t*cos(omega*t)) / omega**3. # noqa
115
+ return (((b**2)*(self.omega**2)-2)*torch.sin(self.omega*b)
116
+ + 2*self.omega*b*torch.cos(self.omega*b)
117
+ - ((a**2)*(self.omega**2)-2)*torch.sin(self.omega*a)
118
+ - 2*self.omega*a*torch.cos(self.omega*a)
119
+ ) / (self.omega**3)
120
+
121
+ def integrate_t_times_psi(self, a, b):
122
+ """Compute integral int_a^b t * psi(t)."""
123
+ # The antiderivative of t*cos(omega*t) is
124
+ # (cos(omega*t) + omega*t*sin(omega*t)) / omega**2.
125
+ return (torch.cos(self.omega*b) + self.omega*b*torch.sin(self.omega*b)
126
+ - torch.cos(self.omega*a) - self.omega*a*torch.sin(self.omega*a)
127
+ ) / (self.omega**2)
128
+
129
+ def integrate_psi(self, a, b):
130
+ """Compute integral int_a^b psi(t)."""
131
+ # The antiderivative of cos(omega*t) is sin(omega*t)/omega.
132
+ return (torch.sin(self.omega*b) - torch.sin(self.omega*a)) / self.omega
133
+
134
+
135
+ class GaussianBasisFunctions(BasisFunctions):
136
+ """Function phi(t) = Gaussian(t; mu, sigma_sq)."""
137
+ def __init__(self, mu, sigma):
138
+ self.mu = mu.unsqueeze(0)
139
+ self.sigma = sigma.unsqueeze(0)
140
+
141
+ def __repr__(self):
142
+ return f"GaussianBasisFunction(mu={self.mu}, sigma={self.sigma})"
143
+
144
+ def __len__(self):
145
+ """Number of basis functions."""
146
+ return self.mu.size(1)
147
+
148
+ def _phi(self, t):
149
+ return 1. / math.sqrt(2 * math.pi) * torch.exp(-.5 * t**2)
150
+
151
+ def _Phi(self, t):
152
+ return .5 * (1 + torch.erf(t / math.sqrt(2)))
153
+
154
+ def _integrate_product_of_gaussians(self, mu, sigma_sq):
155
+ sigma = torch.sqrt(self.sigma ** 2 + sigma_sq)
156
+ return self._phi((mu - self.mu) / sigma) / sigma
157
+
158
+ def evaluate(self, t):
159
+ return self._phi((t - self.mu) / self.sigma) / self.sigma
160
+
161
+ def batch_evaluate(self, t):
162
+ t_ = t.repeat(self.mu.size(0),1) - self.mu.repeat(t.size(0),1).transpose(1,0)
163
+ t_ = t_ / self.sigma.repeat((t.size(0),1)).transpose(1,0)
164
+ return (self._phi(t_) / self.sigma.repeat((t.size(0),1)).transpose(1,0)).transpose(0,1)
165
+
166
+ def integrate_t2_times_psi(self, a, b):
167
+ """Compute integral int_a^b (t**2) * psi(t)."""
168
+ return (self.mu**2 + self.sigma**2) * (
169
+ self._Phi((b - self.mu) / self.sigma) - self._Phi((a - self.mu) / self.sigma)
170
+ ) - (
171
+ self.sigma * (b + self.mu) * self._phi((b - self.mu) / self.sigma)
172
+ ) + (
173
+ self.sigma * (a + self.mu) * self._phi((a - self.mu) / self.sigma)
174
+ )
175
+
176
+ def integrate_t_times_psi(self, a, b):
177
+ """Compute integral int_a^b t * psi(t)."""
178
+ return self.mu * (
179
+ self._Phi((b - self.mu) / self.sigma) - self._Phi((a - self.mu) / self.sigma)
180
+ ) - self.sigma * (
181
+ self._phi((b - self.mu) / self.sigma) - self._phi((a - self.mu) / self.sigma)
182
+ )
183
+
184
+ def integrate_psi(self, a, b):
185
+ """Compute integral int_a^b psi(t)."""
186
+ return self._Phi((b - self.mu) / self.sigma) - self._Phi((a - self.mu) / self.sigma)
187
+
188
+ def integrate_t2_times_psi_gaussian(self, mu, sigma_sq):
189
+ """Compute integral int N(t; mu, sigma_sq) * t**2 * psi(t)."""
190
+ S_tilde = self._integrate_product_of_gaussians(mu, sigma_sq)
191
+ mu_tilde = (
192
+ self.mu * sigma_sq + mu * self.sigma ** 2
193
+ ) / (
194
+ self.sigma ** 2 + sigma_sq
195
+ )
196
+ sigma_sq_tilde = ((self.sigma ** 2) * sigma_sq) / (self.sigma ** 2 + sigma_sq)
197
+ return S_tilde * (mu_tilde ** 2 + sigma_sq_tilde)
198
+
199
+ def integrate_t_times_psi_gaussian(self, mu, sigma_sq):
200
+ """Compute integral int N(t; mu, sigma_sq) * t * psi(t)."""
201
+ S_tilde = self._integrate_product_of_gaussians(mu, sigma_sq)
202
+ mu_tilde = (
203
+ self.mu * sigma_sq + mu * self.sigma ** 2
204
+ ) / (
205
+ self.sigma ** 2 + sigma_sq
206
+ )
207
+ return S_tilde * mu_tilde
208
+
209
+ def integrate_psi_gaussian(self, mu, sigma_sq):
210
+ """Compute integral int N(t; mu, sigma_sq) * psi(t)."""
211
+ return self._integrate_product_of_gaussians(mu, sigma_sq)
212
+
213
+
214
+ class RetangularBasisFunctions(BasisFunctions):
215
+ """Rectangular (box) function: phi(t) = 1 if |t - mu| < width/2 else 0."""
216
+ def __init__(self, mu, sigma):
217
+ self.mu = mu.unsqueeze(0)
218
+ self.width = sigma.unsqueeze(0)
219
+
220
+ def __repr__(self):
221
+ return f"RetangularBasisFunction(mu={self.mu}, width={self.width})"
222
+
223
+ def __len__(self):
224
+ """Number of basis functions."""
225
+ return self.mu.size(1)
226
+
227
+ def batch_evaluate(self, t):
228
+ """
229
+ Evaluate multiple time points against all rectangular basis functions.
230
+ Args:
231
+ t: Tensor of time values to evaluate, shape (num_points,).
232
+ Returns:
233
+ Tensor of evaluations, shape (num_basis, num_points).
234
+ """
235
+ t = t.repeat(self.mu.size(0),1) # Shape: (1, num_points)
236
+ mu = self.mu.repeat(t.size(0),1).transpose(1,0) # Shape: (num_basis, 1)
237
+ width = self.width.repeat(t.size(0),1).transpose(1,0) # Shape: (num_basis, 1)
238
+ return ((t >= (mu - width / 2)) & (t < (mu + width / 2))).float().transpose(0,1)
239
+
240
+ def _Phi(self, t):
241
+ """
242
+ Compute the step function for a single value of t.
243
+ Args:
244
+ t: A scalar or tensor of time values.
245
+ Returns:
246
+ Tensor of values indicating presence in each basis function's range.
247
+ """
248
+ lower_bounds = self.mu - self.width / 2
249
+ upper_bounds = self.mu + self.width / 2
250
+ return ((t >= lower_bounds) & (t < upper_bounds)).float()
251
+
252
+ def evaluate(self, t):
253
+ """
254
+ Evaluate the rectangular basis functions at a single point or array of points.
255
+ Args:
256
+ t: A scalar or 1D tensor of time values.
257
+ Returns:
258
+ Tensor of shape (num_basis,) for scalar input, or (num_basis, num_points) for tensor input.
259
+ """
260
+ if t.ndim == 0: # Scalar input
261
+ return self._Phi(t)
262
+ else: # Tensor input
263
+ # Shape: (1, num_points)
264
+ lower_bounds = (self.mu - self.width / 2) # Shape: (num_basis, 1)
265
+ upper_bounds = (self.mu + self.width / 2) # Shape: (num_basis, 1)
266
+ return ((t >= lower_bounds) & (t < upper_bounds)).float()
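A quick usage sketch (assumptions: mu and sigma are 1-D tensors with one entry per basis function, matching the constructors above; the numbers are purely illustrative):

import torch
from ltm.basis_functions import GaussianBasisFunctions

mu = torch.linspace(0, 1, 8)        # 8 Gaussian centres in [0, 1]
sigma = torch.full((8,), 0.1)       # shared bandwidth
psi = GaussianBasisFunctions(mu, sigma)

t = torch.tensor([[0.25], [0.50]])  # evaluation points, shape [2, 1]
vals = psi.evaluate(t)              # broadcasts to shape [2, 8]
mass = psi.integrate_psi(torch.tensor(0.0), torch.tensor(1.0))  # close to 1 for interior centres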
ltm/long_term_attention_gibbs.py ADDED
@@ -0,0 +1,315 @@
1
+ # coding: utf-8
2
+ """
3
+ Attention modules
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.distributions as dist
8
+
9
+ from .basis_functions import (
10
+ PowerBasisFunctions,
11
+ SineBasisFunctions,
12
+ CosineBasisFunctions,
13
+ GaussianBasisFunctions,
14
+ RetangularBasisFunctions
15
+ )
16
+
17
+ import numpy as np
18
+
19
+
20
+
21
+ class LongTermAttention(nn.Module):
22
+ def __init__(self, head_size:int , length: int, target_len:int, attn_func: str, attn_num_basis: int,
23
+ continuous: bool, attn_drop: float, infinite_memory: bool, n_layers: int,
24
+ n_heads: int, affines: bool, mask: bool, mask_type: str, kl_regularizer: bool, proj_key, proj_value, sigma_0, mu_0, sticky_memories, sigmas, tau, **kwargs):
25
+
26
+ super(LongTermAttention, self).__init__()
27
+
28
+ self.device = 'cuda'
29
+ self.length = length #memory length
30
+ self.target_len = target_len #target length / transformer length
31
+ self.head_size = head_size
32
+ self.attn_num_basis = attn_num_basis
33
+ self.continuous = continuous # whether attention over memory vectors is continuous
34
+ self.attn_func = attn_func # normalizing function
35
+ self.n_head = n_heads
36
+ self.sigmas = sigmas
37
+ self.kl_regularizer = kl_regularizer
38
+ self.sigma_0 = sigma_0
39
+ self.mu_0 = mu_0
40
+ self.proj_key = proj_key
41
+ self.proj_value = proj_value
42
+
43
+ self.affines=affines # whether mu, sigma should be computed using affine transformations
44
+
45
+
46
+ self.sticky_memories=sticky_memories
47
+
48
+ self.mem_threshold=2048
49
+ self.infinite_memory = infinite_memory # whether the memory is infinite
50
+
51
+ self.nb_samples=512 # number of samples used for update
52
+ self.tau = tau #compressing factor
53
+ self.count = 0
54
+
55
+ self.x_past=None # previous memory vectors
56
+ self.B_past=None # previous coefficient matrix
57
+
58
+ self.ridge_penalty=0.5 # ridge penalty
59
+ self.padding = True
60
+
61
+ self.spacing='linear'
62
+
63
+ def get_basis(self, length, target_len):
64
+ def compute_G(l, psi, positions, padding=True):
65
+
66
+ F = torch.zeros(self.attn_num_basis, positions.size(0))
67
+
68
+ basis_functions = psi
69
+ F[:, :] = basis_functions.evaluate(positions.unsqueeze(1)).t()
70
+
71
+ I = torch.eye(self.attn_num_basis)
72
+ G = F.t().matmul((F.matmul(F.t()) + self.ridge_penalty * I).inverse())
73
+
74
+ if padding:
75
+ if l % 2:
76
+ G = G[((l-1)//2):(-(l-1)//2), :]
77
+ else:
78
+ G = G[(l//2):-(l//2), :]
79
+
80
+ return G.to(self.device)
81
+ padding = self.padding
82
+ attn_num_basis = self.attn_num_basis
83
+ if self.continuous:
84
+
85
+ self.psi=[None]
86
+ self.Gs=[None for _ in range(length+1)]
87
+ lengths=[]
88
+ for i in range(length):
89
+ self.psi.append([])
90
+ if (i+1)%target_len==0:
91
+ lengths.append(i+1)
92
+ if length not in lengths:
93
+ lengths.append(length)
94
+ for l in lengths:
95
+ # get positions for memory vectors
96
+ self.add_retangular_basis_functions(self.psi[l], attn_num_basis, device=self.device)
97
+
98
+ if self.spacing=='linear':
99
+ if padding:
100
+ if l % 2:
101
+ shift = 1 / float(l)
102
+ positions = torch.linspace(-.5+shift, 1.5-shift, 2*l-1).to(self.device)
103
+ else:
104
+ shift = 1 / float(2*l)
105
+ positions = torch.linspace(-.5+shift, 1.5-shift, 2*l).to(self.device)
106
+ else:
107
+ shift = 1 / float(2*l)
108
+ positions = torch.linspace(shift, 1-shift, l).to(self.device)
109
+ elif self.spacing=='log':
110
+ if padding:
111
+ if l % 2:
112
+ shift = 1 / float(l)
113
+ positions = torch.linspace(-.5+shift, 1.5-shift, 2*l-1).to(self.device)
114
+ else:
115
+ shift = 1 / float(2*l)
116
+ positions = torch.linspace(-.5+shift, 1.5-shift, 2*l).to(self.device)
117
+
118
+ pos = np.e**(np.log(1+1)*torch.arange(1,length+1)/length)-1
119
+ positions = torch.cat([positions[:int(l/2)],pos.to(self.device),positions[-int(l/2):]])
120
+
121
+ else:
122
+ positions = np.e**(np.log(1+1)*torch.arange(1,length+1)/length)-1
123
+
124
+ # compute basis functions
125
+ self.Gs[l]=compute_G(l, self.psi[l][0], positions, padding=padding) # [L,N]
126
+ self.positions = positions[int(l/2):-int(l/2)]
127
+
128
+ # compute samples for memory update
129
+ if self.infinite_memory:
130
+ tm_tau = torch.arange(1,self.nb_samples+1).float()
131
+ tm_l = torch.arange(self.nb_samples+1,length+self.nb_samples+1).float()
132
+ tm_tau = tm_tau*self.tau/self.nb_samples # positions of old vectors
133
+ tm_l = self.tau + (1-self.tau)*(tm_l-self.nb_samples)/length # positions of new vectors
134
+ positions_inf = torch.cat([tm_tau, tm_l],0).to(self.device) # positions
135
+
136
+ if padding:
137
+ if l % 2:
138
+ shift = 1 / float(length+self.nb_samples)
139
+ positions_pad = torch.linspace(-.5+shift, 1.5-shift, 2*(length+self.nb_samples)-1).to(self.device)
140
+ else:
141
+ shift = 1 / float(2*length+self.nb_samples)
142
+ positions_pad = torch.linspace(-.5+shift, 1.5-shift, 2*(length+self.nb_samples)).to(self.device)
143
+ positions_pad_ = torch.FloatTensor([i for i in positions_pad if i<0]).to(self.device)
144
+ positions_pad__ = torch.FloatTensor([i for i in positions_pad if i>1]).to(self.device)
145
+ positions_inf = torch.cat([positions_pad_,positions_inf,positions_pad__], dim=0)
146
+
147
+ self.samples=None
148
+ for t in tm_tau:
149
+ if self.samples is None:
150
+ self.samples = self.psi[l][0].evaluate(t/self.tau)
151
+ else:
152
+ self.samples = torch.cat([self.samples,self.psi[l][0].evaluate(t/self.tau)], dim=0)
153
+
154
+ # compute G for the infinite case
155
+ self.G_inf = compute_G(self.nb_samples+length, self.psi[l][0], positions_inf, padding=padding) #[L+nb_samples,N]
156
+
157
+ if self.sticky_memories:
158
+ self.bins = torch.linspace(0,1,129).to(device=self.device) #self.positions
159
+ self.nb_bins_cat=1
160
+ self.bins_cat = dist.Categorical(torch.ones(self.nb_bins_cat))
161
+
162
+ def add_gaussian_basis_functions(self, psi, nb_basis, sigmas, device):
163
+ mu, sigma = torch.meshgrid(torch.linspace(0, 1, nb_basis // len(sigmas)), torch.Tensor(sigmas))
164
+ mu = mu.flatten().to(device)
165
+ sigma = sigma.flatten().to(device)
166
+ self.basis_mu=mu
167
+ self.basis_sigma=sigma
168
+ assert mu.size(0) == nb_basis
169
+ psi.append(GaussianBasisFunctions(mu=mu, sigma=sigma))
170
+
171
+ def add_retangular_basis_functions(self, psi, nb_basis, device):
172
+ width = torch.ones(nb_basis, device=device) / nb_basis
173
+
174
+ # Compute the centers (midpoints) of each bin
175
+ edges = torch.linspace(0, 1, nb_basis + 1, device=device)
176
+ mu = (edges[:-1] + edges[1:]) / 2
177
+ psi.append(RetangularBasisFunctions(mu=mu, sigma=width))
178
+
179
+ def value_function(self, x, inf=False):
180
+ if inf:
181
+ G = self.G_inf # [nb_sample+L,N]
182
+ else:
183
+ G = self.Gs[x.size(-1)] # [L,N]
184
+ B = torch.matmul(x, G) # [B,e,N]
185
+ B = B.permute(0,2,1) # [B,N,e]
186
+
187
+ return B
188
+
189
+ def update_inf(self, x):
190
+ if self.B_past is not None:
191
+ if self.sticky_memories:
192
+ bins = self.bins.clone()
193
+ bins[0]=-.000001
194
+ bins[-1]=1.000001
195
+ prob_density = self.compute_probability(self.score, t=bins)
196
+ cum_prob = torch.cumulative_trapezoid(prob_density, bins, dim=-1).to(self.device)
197
+ p = (cum_prob[..., 1:] - cum_prob[..., :-1]).sum(dim=(1, 2))
198
+ p = p / p.sum(-1, keepdim=True) # Normalize over the last dimension (bins)
199
+ p = dist.Categorical(p)
200
+ b = p.sample((self.nb_samples,))
201
+ t = self.bins_cat.sample((self.nb_samples, 1)).to(device=self.device)
202
+ ts = (t*(self.bins[b+1]-self.bins[b])/self.nb_bins_cat +self.bins[b]).transpose(1,0)
203
+ samples = self.psi[self.length][0].batch_evaluate(ts[0]).contiguous()
204
+
205
+ xm_tau = self.B_past.transpose(-1,-2).matmul(samples.transpose(-1,-2)) # [B,e,nb_samples]
206
+ else:
207
+ xm_tau = self.B_past.transpose(-1,-2).matmul(self.samples.transpose(-1,-2)) # [B,e,nb_samples]
208
+
209
+
210
+ x = torch.cat([xm_tau,x], dim=2) # [B,e,nb_samples+L]
211
+ B = self.value_function(x, inf=True) # [B,N,e]
212
+ else:
213
+ B = self.value_function(x)
214
+
215
+ self.B_past=B.detach()
216
+ self.x_past=x
217
+ return B
218
+
219
+ def score(self, t):
220
+ psis = self.psis[0].batch_evaluate(t)
221
+ query = self.queries/ (self.d_head ** 0.5) # divide by sqrt(d_head) [B,h,q,d]
222
+ keys = self.keys.transpose(-1, -2)
223
+ keys = torch.matmul(keys, psis.T) #[B,h,d,1]
224
+ scores = torch.matmul(query, keys) #[B,h,q,1]
225
+ return scores
226
+
227
+ def compute_probability(self, score_fn, num_points=1000, t=None):
228
+ """
229
+ Compute probability distribution p(t).
230
+
231
+ Args:
232
+ score_fn (callable): Function that computes z(t)
233
+ num_points (int): Number of points for numerical integration
234
+
235
+ Returns:
236
+ tuple: (probabilities, normalization constant)
237
+ """
238
+ if t is None:
239
+ # Create integration points
240
+ t = torch.linspace(0, 1, num_points).to(self.device)
241
+
242
+ scores = score_fn(t)
243
+ prob = torch.exp(scores) / torch.trapz(torch.exp(scores), t, dim=-1).unsqueeze(-1)
244
+ return prob
245
+
246
+ def expected_value(self, score_fn, num_points=1000):
247
+ """
248
+ Compute expected value E_p[V(t)] using nested integration.
249
+
250
+ Args:
251
+ score_fn (callable): Function that computes z(t)
252
+ value_fn (callable): Function that computes v(t)
253
+ num_points (int): Number of points for numerical integration
254
+
255
+ Returns:
256
+ torch.Tensor: Expected value
257
+ """
258
+ # Create integration points
259
+ t = torch.linspace(0, 1, num_points).to(self.device)
260
+
261
+ # Compute basis functions
262
+ self.psis = []
263
+ self.add_retangular_basis_functions(self.psis, self.attn_num_basis, self.device)
264
+ psi = self.psis[0].batch_evaluate(t)
265
+ # Compute probability distribution
266
+ prob = self.compute_probability(score_fn, num_points)
267
+ # Compute values at integration points
268
+ values = self.values
269
+ # Compute p(t) * psi(t)
270
+ # Reshape psi for broadcasting to match the shape of prob
271
+ psi_broadcasted = psi.unsqueeze(1).unsqueeze(2).unsqueeze(3)
272
+
273
+ # Expand psi to match the dimensions of prob (num_points, batch_size, n_head, qlen, 256)
274
+ psi_broadcasted = psi_broadcasted.expand(num_points, self.batch_size, self.n_head, self.qlen, self.attn_num_basis)
275
+ integrand = torch.matmul(prob.permute(3,0,1,2).unsqueeze(-1).unsqueeze(-1), psi_broadcasted.unsqueeze(-2)).permute(1, 2, 3, 4, 5, 0).squeeze(-3)
276
+
277
+ integral = torch.trapz(integrand, t, dim=-1)
278
+ # Matrix multiply with values
279
+ expected_value = torch.matmul(integral, values) # [B, h, q, d]
280
+
281
+ return expected_value
282
+
283
+ def forward(self, k, q, new_doc, layer_n):
284
+ self.device = k.device
285
+ if self.continuous:
286
+ klen = int(k.size(1)/(14*14))
287
+ self.length = klen
288
+ batch_size = k.size(0) #batch size
289
+ qlen = q.size(1) #query length
290
+ self.qlen = qlen
291
+ self.batch_size = batch_size
292
+ self.d_head = self.head_size #head size
293
+ self.get_basis(klen, klen)
294
+ # clean memory if going through different document
295
+ if new_doc:
296
+ self.B_past=None
297
+ self.x_past=None
298
+
299
+ k = k.reshape(batch_size, klen, 14, 14, 1024).mean(dim=(2, 3))
300
+ k = k.transpose(1,2)
301
+ # perform memory update
302
+ if self.infinite_memory:
303
+ B = self.update_inf(k)
304
+ else: # compute input continuous approximation
305
+ B = self.value_function(k) # [B,N,e]
306
+ keys = self.proj_key(B)
307
+ values = self.proj_value(B)
308
+ query = q
309
+ self.queries = query.view(batch_size,qlen,self.n_head,self.d_head).transpose(1,2) # [B,h,q,d]
310
+ self.keys = keys.view(batch_size,self.attn_num_basis,self.n_head,self.d_head).transpose(1,2) # [B,h,N,d]
311
+ self.values = values.view(batch_size,self.attn_num_basis,self.n_head,self.d_head).transpose(1,2) # [B, h, q, N]
312
+ context = self.expected_value(self.score) # Shape [1, 32, 768]
313
+
314
+ return context.contiguous().transpose(1,2).reshape(1, qlen, -1)
315
+
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecf6a804c5af89465362453e591d8c3358cd97ad48247baabfc5b070edad2e07
3
+ size 4971600800
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf5745ae7b321d884e62f74589758abee57e79c6ae138e1b1f6877b5cad20565
3
+ size 4915917440
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:475cbd791fe87314409771c7f9651e5f7237c43e8eb5d9662714ff1d3d4fbc04
3
+ size 4999820720
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7648e67deaaa08ce6f73df0f96963c62dba9702927390a73c69bdc328d6f5d27
3
+ size 1499540784
model.safetensors.index.json ADDED
@@ -0,0 +1,934 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16386763776
4
+ },
5
+ "weight_map": {
6
+ "extra_query_tokens": "model-00001-of-00004.safetensors",
7
+ "mistral_model.lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "mistral_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "mistral_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "mistral_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "mistral_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "mistral_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "mistral_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "mistral_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "mistral_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "mistral_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
17
+ "mistral_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
18
+ "mistral_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "mistral_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
20
+ "mistral_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
21
+ "mistral_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
22
+ "mistral_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
23
+ "mistral_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
24
+ "mistral_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
25
+ "mistral_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
26
+ "mistral_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
27
+ "mistral_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
28
+ "mistral_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
29
+ "mistral_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
30
+ "mistral_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
31
+ "mistral_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
32
+ "mistral_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
33
+ "mistral_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
34
+ "mistral_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
35
+ "mistral_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
36
+ "mistral_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "mistral_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
38
+ "mistral_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
39
+ "mistral_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
40
+ "mistral_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
41
+ "mistral_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
42
+ "mistral_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
43
+ "mistral_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
44
+ "mistral_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "mistral_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "mistral_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "mistral_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "mistral_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "mistral_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "mistral_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "mistral_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "mistral_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
53
+ "mistral_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
54
+ "mistral_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
55
+ "mistral_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
56
+ "mistral_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
57
+ "mistral_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
58
+ "mistral_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
59
+ "mistral_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
60
+ "mistral_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
61
+ "mistral_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
62
+ "mistral_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
63
+ "mistral_model.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "mistral_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
65
+ "mistral_model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
66
+ "mistral_model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
67
+ "mistral_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
68
+ "mistral_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
69
+ "mistral_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
70
+ "mistral_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
71
+ "mistral_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
72
+ "mistral_model.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "mistral_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
74
+ "mistral_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
75
+ "mistral_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
76
+ "mistral_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
77
+ "mistral_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
78
+ "mistral_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
79
+ "mistral_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
80
+ "mistral_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "mistral_model.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "mistral_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "mistral_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "mistral_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "mistral_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "mistral_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "mistral_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "mistral_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
89
+ "mistral_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
90
+ "mistral_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
91
+ "mistral_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
92
+ "mistral_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
93
+ "mistral_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
94
+ "mistral_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
95
+ "mistral_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
96
+ "mistral_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
97
+ "mistral_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
98
+ "mistral_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
99
+ "mistral_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
100
+ "mistral_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
101
+ "mistral_model.model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
102
+ "mistral_model.model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
103
+ "mistral_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
104
+ "mistral_model.model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
105
+ "mistral_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
106
+ "mistral_model.model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
107
+ "mistral_model.model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
108
+ "mistral_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
109
+ "mistral_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
110
+ "mistral_model.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
111
+ "mistral_model.model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
112
+ "mistral_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
113
+ "mistral_model.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
114
+ "mistral_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
115
+ "mistral_model.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
116
+ "mistral_model.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
117
+ "mistral_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
118
+ "mistral_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
119
+ "mistral_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
120
+ "mistral_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
121
+ "mistral_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
122
+ "mistral_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
123
+ "mistral_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
124
+ "mistral_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
125
+ "mistral_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
126
+ "mistral_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
127
+ "mistral_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
128
+ "mistral_model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
129
+ "mistral_model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
130
+ "mistral_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
131
+ "mistral_model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
132
+ "mistral_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
133
+ "mistral_model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
134
+ "mistral_model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
135
+ "mistral_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
136
+ "mistral_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
137
+ "mistral_model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
138
+ "mistral_model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
139
+ "mistral_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
140
+ "mistral_model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
141
+ "mistral_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
142
+ "mistral_model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
143
+ "mistral_model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
144
+ "mistral_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "mistral_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
146
+ "mistral_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
147
+ "mistral_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
148
+ "mistral_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
149
+ "mistral_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
150
+ "mistral_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
151
+ "mistral_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
152
+ "mistral_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "mistral_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
154
+ "mistral_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
155
+ "mistral_model.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
156
+ "mistral_model.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
157
+ "mistral_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
158
+ "mistral_model.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
159
+ "mistral_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
160
+ "mistral_model.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
161
+ "mistral_model.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
162
+ "mistral_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
163
+ "mistral_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
164
+ "mistral_model.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
165
+ "mistral_model.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
166
+ "mistral_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
167
+ "mistral_model.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
168
+ "mistral_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
169
+ "mistral_model.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
170
+ "mistral_model.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
171
+ "mistral_model.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
172
+ "mistral_model.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
173
+ "mistral_model.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
174
+ "mistral_model.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
175
+ "mistral_model.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
176
+ "mistral_model.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
177
+ "mistral_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
178
+ "mistral_model.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
179
+ "mistral_model.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
180
+ "mistral_model.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "mistral_model.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
182
+ "mistral_model.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
183
+ "mistral_model.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
184
+ "mistral_model.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
185
+ "mistral_model.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
186
+ "mistral_model.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
187
+ "mistral_model.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
188
+ "mistral_model.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "mistral_model.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "mistral_model.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "mistral_model.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "mistral_model.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "mistral_model.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "mistral_model.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "mistral_model.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "mistral_model.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
197
+ "mistral_model.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
198
+ "mistral_model.model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
199
+ "mistral_model.model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
200
+ "mistral_model.model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
201
+ "mistral_model.model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
202
+ "mistral_model.model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
203
+ "mistral_model.model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
204
+ "mistral_model.model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
205
+ "mistral_model.model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
206
+ "mistral_model.model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "mistral_model.model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
208
+ "mistral_model.model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
209
+ "mistral_model.model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
210
+ "mistral_model.model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
211
+ "mistral_model.model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
212
+ "mistral_model.model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
213
+ "mistral_model.model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
214
+ "mistral_model.model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
215
+ "mistral_model.model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
216
+ "mistral_model.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
217
+ "mistral_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
218
+ "mistral_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
219
+ "mistral_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
220
+ "mistral_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
221
+ "mistral_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
222
+ "mistral_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
223
+ "mistral_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
224
+ "mistral_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
225
+ "mistral_model.model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
226
+ "mistral_model.model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
227
+ "mistral_model.model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
228
+ "mistral_model.model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
229
+ "mistral_model.model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
230
+ "mistral_model.model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
231
+ "mistral_model.model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
232
+ "mistral_model.model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
233
+ "mistral_model.model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
234
+ "mistral_model.model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
235
+ "mistral_model.model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
236
+ "mistral_model.model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
237
+ "mistral_model.model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
238
+ "mistral_model.model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
239
+ "mistral_model.model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
240
+ "mistral_model.model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
241
+ "mistral_model.model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
242
+ "mistral_model.model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
243
+ "mistral_model.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
244
+ "mistral_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
245
+ "mistral_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
246
+ "mistral_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
247
+ "mistral_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
248
+ "mistral_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
249
+ "mistral_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
250
+ "mistral_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
251
+ "mistral_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
252
+ "mistral_model.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
253
+ "mistral_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
254
+ "mistral_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
255
+ "mistral_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
256
+ "mistral_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
257
+ "mistral_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
258
+ "mistral_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
259
+ "mistral_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
260
+ "mistral_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
261
+ "mistral_model.model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
262
+ "mistral_model.model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
263
+ "mistral_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "mistral_model.model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
265
+ "mistral_model.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
266
+ "mistral_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "mistral_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "mistral_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
269
+ "mistral_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
270
+ "mistral_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
271
+ "mistral_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
272
+ "mistral_model.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
273
+ "mistral_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
274
+ "mistral_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
275
+ "mistral_model.model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
276
+ "mistral_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
277
+ "mistral_model.model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
278
+ "mistral_model.model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
279
+ "mistral_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
280
+ "mistral_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
281
+ "mistral_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
282
+ "mistral_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
283
+ "mistral_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
284
+ "mistral_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
285
+ "mistral_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
286
+ "mistral_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
287
+ "mistral_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
288
+ "mistral_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
289
+ "mistral_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
290
+ "mistral_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
291
+ "mistral_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
292
+ "mistral_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
293
+ "mistral_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
294
+ "mistral_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
295
+ "mistral_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
296
+ "mistral_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
297
+ "mistral_model.model.norm.weight": "model-00004-of-00004.safetensors",
298
+ "mistral_proj.bias": "model-00004-of-00004.safetensors",
299
+ "mistral_proj.weight": "model-00004-of-00004.safetensors",
300
+ "qformer.bert.embeddings.LayerNorm.bias": "model-00001-of-00004.safetensors",
301
+ "qformer.bert.embeddings.LayerNorm.weight": "model-00001-of-00004.safetensors",
302
+ "qformer.bert.embeddings.position_embeddings.weight": "model-00001-of-00004.safetensors",
303
+ "qformer.bert.embeddings.position_ids": "model-00001-of-00004.safetensors",
304
+ "qformer.bert.embeddings.word_embeddings.weight": "model-00001-of-00004.safetensors",
305
+ "qformer.bert.encoder.layer.0.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
306
+ "qformer.bert.encoder.layer.0.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
307
+ "qformer.bert.encoder.layer.0.attention.output.dense.bias": "model-00001-of-00004.safetensors",
308
+ "qformer.bert.encoder.layer.0.attention.output.dense.weight": "model-00001-of-00004.safetensors",
309
+ "qformer.bert.encoder.layer.0.attention.self.key.bias": "model-00001-of-00004.safetensors",
310
+ "qformer.bert.encoder.layer.0.attention.self.key.weight": "model-00001-of-00004.safetensors",
311
+ "qformer.bert.encoder.layer.0.attention.self.query.bias": "model-00001-of-00004.safetensors",
312
+ "qformer.bert.encoder.layer.0.attention.self.query.weight": "model-00001-of-00004.safetensors",
313
+ "qformer.bert.encoder.layer.0.attention.self.value.bias": "model-00001-of-00004.safetensors",
314
+ "qformer.bert.encoder.layer.0.attention.self.value.weight": "model-00001-of-00004.safetensors",
315
+ "qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
316
+ "qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
317
+ "qformer.bert.encoder.layer.0.crossattention.output.dense.bias": "model-00001-of-00004.safetensors",
318
+ "qformer.bert.encoder.layer.0.crossattention.output.dense.weight": "model-00001-of-00004.safetensors",
319
+ "qformer.bert.encoder.layer.0.crossattention.self.key.bias": "model-00001-of-00004.safetensors",
320
+ "qformer.bert.encoder.layer.0.crossattention.self.key.weight": "model-00001-of-00004.safetensors",
321
+ "qformer.bert.encoder.layer.0.crossattention.self.query.bias": "model-00001-of-00004.safetensors",
322
+ "qformer.bert.encoder.layer.0.crossattention.self.query.weight": "model-00001-of-00004.safetensors",
323
+ "qformer.bert.encoder.layer.0.crossattention.self.value.bias": "model-00001-of-00004.safetensors",
324
+ "qformer.bert.encoder.layer.0.crossattention.self.value.weight": "model-00001-of-00004.safetensors",
325
+ "qformer.bert.encoder.layer.0.intermediate.dense.bias": "model-00001-of-00004.safetensors",
326
+ "qformer.bert.encoder.layer.0.intermediate.dense.weight": "model-00001-of-00004.safetensors",
327
+ "qformer.bert.encoder.layer.0.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
328
+ "qformer.bert.encoder.layer.0.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
329
+ "qformer.bert.encoder.layer.0.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
330
+ "qformer.bert.encoder.layer.0.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
331
+ "qformer.bert.encoder.layer.0.output.dense.bias": "model-00001-of-00004.safetensors",
332
+ "qformer.bert.encoder.layer.0.output.dense.weight": "model-00001-of-00004.safetensors",
333
+ "qformer.bert.encoder.layer.0.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
334
+ "qformer.bert.encoder.layer.0.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
335
+ "qformer.bert.encoder.layer.0.output_query.dense.bias": "model-00001-of-00004.safetensors",
336
+ "qformer.bert.encoder.layer.0.output_query.dense.weight": "model-00001-of-00004.safetensors",
337
+ "qformer.bert.encoder.layer.1.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
338
+ "qformer.bert.encoder.layer.1.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
339
+ "qformer.bert.encoder.layer.1.attention.output.dense.bias": "model-00001-of-00004.safetensors",
340
+ "qformer.bert.encoder.layer.1.attention.output.dense.weight": "model-00001-of-00004.safetensors",
341
+ "qformer.bert.encoder.layer.1.attention.self.key.bias": "model-00001-of-00004.safetensors",
342
+ "qformer.bert.encoder.layer.1.attention.self.key.weight": "model-00001-of-00004.safetensors",
343
+ "qformer.bert.encoder.layer.1.attention.self.query.bias": "model-00001-of-00004.safetensors",
344
+ "qformer.bert.encoder.layer.1.attention.self.query.weight": "model-00001-of-00004.safetensors",
345
+ "qformer.bert.encoder.layer.1.attention.self.value.bias": "model-00001-of-00004.safetensors",
346
+ "qformer.bert.encoder.layer.1.attention.self.value.weight": "model-00001-of-00004.safetensors",
347
+ "qformer.bert.encoder.layer.1.intermediate.dense.bias": "model-00001-of-00004.safetensors",
348
+ "qformer.bert.encoder.layer.1.intermediate.dense.weight": "model-00001-of-00004.safetensors",
349
+ "qformer.bert.encoder.layer.1.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
350
+ "qformer.bert.encoder.layer.1.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
351
+ "qformer.bert.encoder.layer.1.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
352
+ "qformer.bert.encoder.layer.1.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
353
+ "qformer.bert.encoder.layer.1.output.dense.bias": "model-00001-of-00004.safetensors",
354
+ "qformer.bert.encoder.layer.1.output.dense.weight": "model-00001-of-00004.safetensors",
355
+ "qformer.bert.encoder.layer.1.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
356
+ "qformer.bert.encoder.layer.1.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
357
+ "qformer.bert.encoder.layer.1.output_query.dense.bias": "model-00001-of-00004.safetensors",
358
+ "qformer.bert.encoder.layer.1.output_query.dense.weight": "model-00001-of-00004.safetensors",
359
+ "qformer.bert.encoder.layer.10.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
360
+ "qformer.bert.encoder.layer.10.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
361
+ "qformer.bert.encoder.layer.10.attention.output.dense.bias": "model-00001-of-00004.safetensors",
362
+ "qformer.bert.encoder.layer.10.attention.output.dense.weight": "model-00001-of-00004.safetensors",
363
+ "qformer.bert.encoder.layer.10.attention.self.key.bias": "model-00001-of-00004.safetensors",
364
+ "qformer.bert.encoder.layer.10.attention.self.key.weight": "model-00001-of-00004.safetensors",
365
+ "qformer.bert.encoder.layer.10.attention.self.query.bias": "model-00001-of-00004.safetensors",
366
+ "qformer.bert.encoder.layer.10.attention.self.query.weight": "model-00001-of-00004.safetensors",
367
+ "qformer.bert.encoder.layer.10.attention.self.value.bias": "model-00001-of-00004.safetensors",
368
+ "qformer.bert.encoder.layer.10.attention.self.value.weight": "model-00001-of-00004.safetensors",
369
+ "qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
370
+ "qformer.bert.encoder.layer.10.crossattention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
371
+ "qformer.bert.encoder.layer.10.crossattention.output.dense.bias": "model-00001-of-00004.safetensors",
372
+ "qformer.bert.encoder.layer.10.crossattention.output.dense.weight": "model-00001-of-00004.safetensors",
373
+ "qformer.bert.encoder.layer.10.crossattention.self.key.bias": "model-00001-of-00004.safetensors",
374
+ "qformer.bert.encoder.layer.10.crossattention.self.key.weight": "model-00001-of-00004.safetensors",
375
+ "qformer.bert.encoder.layer.10.crossattention.self.query.bias": "model-00001-of-00004.safetensors",
376
+ "qformer.bert.encoder.layer.10.crossattention.self.query.weight": "model-00001-of-00004.safetensors",
377
+ "qformer.bert.encoder.layer.10.crossattention.self.value.bias": "model-00001-of-00004.safetensors",
378
+ "qformer.bert.encoder.layer.10.crossattention.self.value.weight": "model-00001-of-00004.safetensors",
379
+ "qformer.bert.encoder.layer.10.intermediate.dense.bias": "model-00001-of-00004.safetensors",
380
+ "qformer.bert.encoder.layer.10.intermediate.dense.weight": "model-00001-of-00004.safetensors",
381
+ "qformer.bert.encoder.layer.10.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
382
+ "qformer.bert.encoder.layer.10.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
383
+ "qformer.bert.encoder.layer.10.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
384
+ "qformer.bert.encoder.layer.10.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
385
+ "qformer.bert.encoder.layer.10.output.dense.bias": "model-00001-of-00004.safetensors",
386
+ "qformer.bert.encoder.layer.10.output.dense.weight": "model-00001-of-00004.safetensors",
387
+ "qformer.bert.encoder.layer.10.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
388
+ "qformer.bert.encoder.layer.10.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
389
+ "qformer.bert.encoder.layer.10.output_query.dense.bias": "model-00001-of-00004.safetensors",
390
+ "qformer.bert.encoder.layer.10.output_query.dense.weight": "model-00001-of-00004.safetensors",
391
+ "qformer.bert.encoder.layer.11.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
392
+ "qformer.bert.encoder.layer.11.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
393
+ "qformer.bert.encoder.layer.11.attention.output.dense.bias": "model-00001-of-00004.safetensors",
394
+ "qformer.bert.encoder.layer.11.attention.output.dense.weight": "model-00001-of-00004.safetensors",
395
+ "qformer.bert.encoder.layer.11.attention.self.key.bias": "model-00001-of-00004.safetensors",
396
+ "qformer.bert.encoder.layer.11.attention.self.key.weight": "model-00001-of-00004.safetensors",
397
+ "qformer.bert.encoder.layer.11.attention.self.query.bias": "model-00001-of-00004.safetensors",
398
+ "qformer.bert.encoder.layer.11.attention.self.query.weight": "model-00001-of-00004.safetensors",
399
+ "qformer.bert.encoder.layer.11.attention.self.value.bias": "model-00001-of-00004.safetensors",
400
+ "qformer.bert.encoder.layer.11.attention.self.value.weight": "model-00001-of-00004.safetensors",
401
+ "qformer.bert.encoder.layer.11.intermediate.dense.bias": "model-00001-of-00004.safetensors",
402
+ "qformer.bert.encoder.layer.11.intermediate.dense.weight": "model-00001-of-00004.safetensors",
403
+ "qformer.bert.encoder.layer.11.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
404
+ "qformer.bert.encoder.layer.11.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
405
+ "qformer.bert.encoder.layer.11.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
406
+ "qformer.bert.encoder.layer.11.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
407
+ "qformer.bert.encoder.layer.11.output.dense.bias": "model-00001-of-00004.safetensors",
408
+ "qformer.bert.encoder.layer.11.output.dense.weight": "model-00001-of-00004.safetensors",
409
+ "qformer.bert.encoder.layer.11.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
410
+ "qformer.bert.encoder.layer.11.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
411
+ "qformer.bert.encoder.layer.11.output_query.dense.bias": "model-00001-of-00004.safetensors",
412
+ "qformer.bert.encoder.layer.11.output_query.dense.weight": "model-00001-of-00004.safetensors",
413
+ "qformer.bert.encoder.layer.2.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
414
+ "qformer.bert.encoder.layer.2.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
415
+ "qformer.bert.encoder.layer.2.attention.output.dense.bias": "model-00001-of-00004.safetensors",
416
+ "qformer.bert.encoder.layer.2.attention.output.dense.weight": "model-00001-of-00004.safetensors",
417
+ "qformer.bert.encoder.layer.2.attention.self.key.bias": "model-00001-of-00004.safetensors",
418
+ "qformer.bert.encoder.layer.2.attention.self.key.weight": "model-00001-of-00004.safetensors",
419
+ "qformer.bert.encoder.layer.2.attention.self.query.bias": "model-00001-of-00004.safetensors",
420
+ "qformer.bert.encoder.layer.2.attention.self.query.weight": "model-00001-of-00004.safetensors",
421
+ "qformer.bert.encoder.layer.2.attention.self.value.bias": "model-00001-of-00004.safetensors",
422
+ "qformer.bert.encoder.layer.2.attention.self.value.weight": "model-00001-of-00004.safetensors",
423
+ "qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
424
+ "qformer.bert.encoder.layer.2.crossattention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
425
+ "qformer.bert.encoder.layer.2.crossattention.output.dense.bias": "model-00001-of-00004.safetensors",
426
+ "qformer.bert.encoder.layer.2.crossattention.output.dense.weight": "model-00001-of-00004.safetensors",
427
+ "qformer.bert.encoder.layer.2.crossattention.self.key.bias": "model-00001-of-00004.safetensors",
428
+ "qformer.bert.encoder.layer.2.crossattention.self.key.weight": "model-00001-of-00004.safetensors",
429
+ "qformer.bert.encoder.layer.2.crossattention.self.query.bias": "model-00001-of-00004.safetensors",
430
+ "qformer.bert.encoder.layer.2.crossattention.self.query.weight": "model-00001-of-00004.safetensors",
431
+ "qformer.bert.encoder.layer.2.crossattention.self.value.bias": "model-00001-of-00004.safetensors",
432
+ "qformer.bert.encoder.layer.2.crossattention.self.value.weight": "model-00001-of-00004.safetensors",
433
+ "qformer.bert.encoder.layer.2.intermediate.dense.bias": "model-00001-of-00004.safetensors",
434
+ "qformer.bert.encoder.layer.2.intermediate.dense.weight": "model-00001-of-00004.safetensors",
435
+ "qformer.bert.encoder.layer.2.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
436
+ "qformer.bert.encoder.layer.2.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
437
+ "qformer.bert.encoder.layer.2.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
438
+ "qformer.bert.encoder.layer.2.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
439
+ "qformer.bert.encoder.layer.2.output.dense.bias": "model-00001-of-00004.safetensors",
440
+ "qformer.bert.encoder.layer.2.output.dense.weight": "model-00001-of-00004.safetensors",
441
+ "qformer.bert.encoder.layer.2.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
442
+ "qformer.bert.encoder.layer.2.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
443
+ "qformer.bert.encoder.layer.2.output_query.dense.bias": "model-00001-of-00004.safetensors",
444
+ "qformer.bert.encoder.layer.2.output_query.dense.weight": "model-00001-of-00004.safetensors",
445
+ "qformer.bert.encoder.layer.3.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
446
+ "qformer.bert.encoder.layer.3.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
447
+ "qformer.bert.encoder.layer.3.attention.output.dense.bias": "model-00001-of-00004.safetensors",
448
+ "qformer.bert.encoder.layer.3.attention.output.dense.weight": "model-00001-of-00004.safetensors",
449
+ "qformer.bert.encoder.layer.3.attention.self.key.bias": "model-00001-of-00004.safetensors",
450
+ "qformer.bert.encoder.layer.3.attention.self.key.weight": "model-00001-of-00004.safetensors",
451
+ "qformer.bert.encoder.layer.3.attention.self.query.bias": "model-00001-of-00004.safetensors",
452
+ "qformer.bert.encoder.layer.3.attention.self.query.weight": "model-00001-of-00004.safetensors",
453
+ "qformer.bert.encoder.layer.3.attention.self.value.bias": "model-00001-of-00004.safetensors",
454
+ "qformer.bert.encoder.layer.3.attention.self.value.weight": "model-00001-of-00004.safetensors",
455
+ "qformer.bert.encoder.layer.3.intermediate.dense.bias": "model-00001-of-00004.safetensors",
456
+ "qformer.bert.encoder.layer.3.intermediate.dense.weight": "model-00001-of-00004.safetensors",
457
+ "qformer.bert.encoder.layer.3.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
458
+ "qformer.bert.encoder.layer.3.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
459
+ "qformer.bert.encoder.layer.3.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
460
+ "qformer.bert.encoder.layer.3.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
461
+ "qformer.bert.encoder.layer.3.output.dense.bias": "model-00001-of-00004.safetensors",
462
+ "qformer.bert.encoder.layer.3.output.dense.weight": "model-00001-of-00004.safetensors",
463
+ "qformer.bert.encoder.layer.3.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
464
+ "qformer.bert.encoder.layer.3.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
465
+ "qformer.bert.encoder.layer.3.output_query.dense.bias": "model-00001-of-00004.safetensors",
466
+ "qformer.bert.encoder.layer.3.output_query.dense.weight": "model-00001-of-00004.safetensors",
467
+ "qformer.bert.encoder.layer.4.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
468
+ "qformer.bert.encoder.layer.4.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
469
+ "qformer.bert.encoder.layer.4.attention.output.dense.bias": "model-00001-of-00004.safetensors",
470
+ "qformer.bert.encoder.layer.4.attention.output.dense.weight": "model-00001-of-00004.safetensors",
471
+ "qformer.bert.encoder.layer.4.attention.self.key.bias": "model-00001-of-00004.safetensors",
472
+ "qformer.bert.encoder.layer.4.attention.self.key.weight": "model-00001-of-00004.safetensors",
473
+ "qformer.bert.encoder.layer.4.attention.self.query.bias": "model-00001-of-00004.safetensors",
474
+ "qformer.bert.encoder.layer.4.attention.self.query.weight": "model-00001-of-00004.safetensors",
475
+ "qformer.bert.encoder.layer.4.attention.self.value.bias": "model-00001-of-00004.safetensors",
476
+ "qformer.bert.encoder.layer.4.attention.self.value.weight": "model-00001-of-00004.safetensors",
477
+ "qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
478
+ "qformer.bert.encoder.layer.4.crossattention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
479
+ "qformer.bert.encoder.layer.4.crossattention.output.dense.bias": "model-00001-of-00004.safetensors",
480
+ "qformer.bert.encoder.layer.4.crossattention.output.dense.weight": "model-00001-of-00004.safetensors",
481
+ "qformer.bert.encoder.layer.4.crossattention.self.key.bias": "model-00001-of-00004.safetensors",
482
+ "qformer.bert.encoder.layer.4.crossattention.self.key.weight": "model-00001-of-00004.safetensors",
483
+ "qformer.bert.encoder.layer.4.crossattention.self.query.bias": "model-00001-of-00004.safetensors",
484
+ "qformer.bert.encoder.layer.4.crossattention.self.query.weight": "model-00001-of-00004.safetensors",
485
+ "qformer.bert.encoder.layer.4.crossattention.self.value.bias": "model-00001-of-00004.safetensors",
486
+ "qformer.bert.encoder.layer.4.crossattention.self.value.weight": "model-00001-of-00004.safetensors",
487
+ "qformer.bert.encoder.layer.4.intermediate.dense.bias": "model-00001-of-00004.safetensors",
488
+ "qformer.bert.encoder.layer.4.intermediate.dense.weight": "model-00001-of-00004.safetensors",
489
+ "qformer.bert.encoder.layer.4.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
490
+ "qformer.bert.encoder.layer.4.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
491
+ "qformer.bert.encoder.layer.4.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
492
+ "qformer.bert.encoder.layer.4.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
493
+ "qformer.bert.encoder.layer.4.output.dense.bias": "model-00001-of-00004.safetensors",
494
+ "qformer.bert.encoder.layer.4.output.dense.weight": "model-00001-of-00004.safetensors",
495
+ "qformer.bert.encoder.layer.4.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
496
+ "qformer.bert.encoder.layer.4.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
497
+ "qformer.bert.encoder.layer.4.output_query.dense.bias": "model-00001-of-00004.safetensors",
498
+ "qformer.bert.encoder.layer.4.output_query.dense.weight": "model-00001-of-00004.safetensors",
499
+ "qformer.bert.encoder.layer.5.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
500
+ "qformer.bert.encoder.layer.5.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
501
+ "qformer.bert.encoder.layer.5.attention.output.dense.bias": "model-00001-of-00004.safetensors",
502
+ "qformer.bert.encoder.layer.5.attention.output.dense.weight": "model-00001-of-00004.safetensors",
503
+ "qformer.bert.encoder.layer.5.attention.self.key.bias": "model-00001-of-00004.safetensors",
504
+ "qformer.bert.encoder.layer.5.attention.self.key.weight": "model-00001-of-00004.safetensors",
505
+ "qformer.bert.encoder.layer.5.attention.self.query.bias": "model-00001-of-00004.safetensors",
506
+ "qformer.bert.encoder.layer.5.attention.self.query.weight": "model-00001-of-00004.safetensors",
507
+ "qformer.bert.encoder.layer.5.attention.self.value.bias": "model-00001-of-00004.safetensors",
508
+ "qformer.bert.encoder.layer.5.attention.self.value.weight": "model-00001-of-00004.safetensors",
509
+ "qformer.bert.encoder.layer.5.intermediate.dense.bias": "model-00001-of-00004.safetensors",
510
+ "qformer.bert.encoder.layer.5.intermediate.dense.weight": "model-00001-of-00004.safetensors",
511
+ "qformer.bert.encoder.layer.5.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
512
+ "qformer.bert.encoder.layer.5.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
513
+ "qformer.bert.encoder.layer.5.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
514
+ "qformer.bert.encoder.layer.5.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
515
+ "qformer.bert.encoder.layer.5.output.dense.bias": "model-00001-of-00004.safetensors",
516
+ "qformer.bert.encoder.layer.5.output.dense.weight": "model-00001-of-00004.safetensors",
517
+ "qformer.bert.encoder.layer.5.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
518
+ "qformer.bert.encoder.layer.5.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
519
+ "qformer.bert.encoder.layer.5.output_query.dense.bias": "model-00001-of-00004.safetensors",
520
+ "qformer.bert.encoder.layer.5.output_query.dense.weight": "model-00001-of-00004.safetensors",
521
+ "qformer.bert.encoder.layer.6.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
522
+ "qformer.bert.encoder.layer.6.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
523
+ "qformer.bert.encoder.layer.6.attention.output.dense.bias": "model-00001-of-00004.safetensors",
524
+ "qformer.bert.encoder.layer.6.attention.output.dense.weight": "model-00001-of-00004.safetensors",
525
+ "qformer.bert.encoder.layer.6.attention.self.key.bias": "model-00001-of-00004.safetensors",
526
+ "qformer.bert.encoder.layer.6.attention.self.key.weight": "model-00001-of-00004.safetensors",
527
+ "qformer.bert.encoder.layer.6.attention.self.query.bias": "model-00001-of-00004.safetensors",
528
+ "qformer.bert.encoder.layer.6.attention.self.query.weight": "model-00001-of-00004.safetensors",
529
+ "qformer.bert.encoder.layer.6.attention.self.value.bias": "model-00001-of-00004.safetensors",
530
+ "qformer.bert.encoder.layer.6.attention.self.value.weight": "model-00001-of-00004.safetensors",
531
+ "qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
532
+ "qformer.bert.encoder.layer.6.crossattention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
533
+ "qformer.bert.encoder.layer.6.crossattention.output.dense.bias": "model-00001-of-00004.safetensors",
534
+ "qformer.bert.encoder.layer.6.crossattention.output.dense.weight": "model-00001-of-00004.safetensors",
535
+ "qformer.bert.encoder.layer.6.crossattention.self.key.bias": "model-00001-of-00004.safetensors",
536
+ "qformer.bert.encoder.layer.6.crossattention.self.key.weight": "model-00001-of-00004.safetensors",
537
+ "qformer.bert.encoder.layer.6.crossattention.self.query.bias": "model-00001-of-00004.safetensors",
538
+ "qformer.bert.encoder.layer.6.crossattention.self.query.weight": "model-00001-of-00004.safetensors",
539
+ "qformer.bert.encoder.layer.6.crossattention.self.value.bias": "model-00001-of-00004.safetensors",
540
+ "qformer.bert.encoder.layer.6.crossattention.self.value.weight": "model-00001-of-00004.safetensors",
541
+ "qformer.bert.encoder.layer.6.intermediate.dense.bias": "model-00001-of-00004.safetensors",
542
+ "qformer.bert.encoder.layer.6.intermediate.dense.weight": "model-00001-of-00004.safetensors",
543
+ "qformer.bert.encoder.layer.6.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
544
+ "qformer.bert.encoder.layer.6.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
545
+ "qformer.bert.encoder.layer.6.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
546
+ "qformer.bert.encoder.layer.6.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
547
+ "qformer.bert.encoder.layer.6.output.dense.bias": "model-00001-of-00004.safetensors",
548
+ "qformer.bert.encoder.layer.6.output.dense.weight": "model-00001-of-00004.safetensors",
549
+ "qformer.bert.encoder.layer.6.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
550
+ "qformer.bert.encoder.layer.6.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
551
+ "qformer.bert.encoder.layer.6.output_query.dense.bias": "model-00001-of-00004.safetensors",
552
+ "qformer.bert.encoder.layer.6.output_query.dense.weight": "model-00001-of-00004.safetensors",
553
+ "qformer.bert.encoder.layer.7.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
554
+ "qformer.bert.encoder.layer.7.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
555
+ "qformer.bert.encoder.layer.7.attention.output.dense.bias": "model-00001-of-00004.safetensors",
556
+ "qformer.bert.encoder.layer.7.attention.output.dense.weight": "model-00001-of-00004.safetensors",
557
+ "qformer.bert.encoder.layer.7.attention.self.key.bias": "model-00001-of-00004.safetensors",
558
+ "qformer.bert.encoder.layer.7.attention.self.key.weight": "model-00001-of-00004.safetensors",
559
+ "qformer.bert.encoder.layer.7.attention.self.query.bias": "model-00001-of-00004.safetensors",
560
+ "qformer.bert.encoder.layer.7.attention.self.query.weight": "model-00001-of-00004.safetensors",
561
+ "qformer.bert.encoder.layer.7.attention.self.value.bias": "model-00001-of-00004.safetensors",
562
+ "qformer.bert.encoder.layer.7.attention.self.value.weight": "model-00001-of-00004.safetensors",
563
+ "qformer.bert.encoder.layer.7.intermediate.dense.bias": "model-00001-of-00004.safetensors",
564
+ "qformer.bert.encoder.layer.7.intermediate.dense.weight": "model-00001-of-00004.safetensors",
565
+ "qformer.bert.encoder.layer.7.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
566
+ "qformer.bert.encoder.layer.7.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
567
+ "qformer.bert.encoder.layer.7.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
568
+ "qformer.bert.encoder.layer.7.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
569
+ "qformer.bert.encoder.layer.7.output.dense.bias": "model-00001-of-00004.safetensors",
570
+ "qformer.bert.encoder.layer.7.output.dense.weight": "model-00001-of-00004.safetensors",
571
+ "qformer.bert.encoder.layer.7.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
572
+ "qformer.bert.encoder.layer.7.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
573
+ "qformer.bert.encoder.layer.7.output_query.dense.bias": "model-00001-of-00004.safetensors",
574
+ "qformer.bert.encoder.layer.7.output_query.dense.weight": "model-00001-of-00004.safetensors",
575
+ "qformer.bert.encoder.layer.8.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
576
+ "qformer.bert.encoder.layer.8.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
577
+ "qformer.bert.encoder.layer.8.attention.output.dense.bias": "model-00001-of-00004.safetensors",
578
+ "qformer.bert.encoder.layer.8.attention.output.dense.weight": "model-00001-of-00004.safetensors",
579
+ "qformer.bert.encoder.layer.8.attention.self.key.bias": "model-00001-of-00004.safetensors",
580
+ "qformer.bert.encoder.layer.8.attention.self.key.weight": "model-00001-of-00004.safetensors",
581
+ "qformer.bert.encoder.layer.8.attention.self.query.bias": "model-00001-of-00004.safetensors",
582
+ "qformer.bert.encoder.layer.8.attention.self.query.weight": "model-00001-of-00004.safetensors",
583
+ "qformer.bert.encoder.layer.8.attention.self.value.bias": "model-00001-of-00004.safetensors",
584
+ "qformer.bert.encoder.layer.8.attention.self.value.weight": "model-00001-of-00004.safetensors",
585
+ "qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
586
+ "qformer.bert.encoder.layer.8.crossattention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
587
+ "qformer.bert.encoder.layer.8.crossattention.output.dense.bias": "model-00001-of-00004.safetensors",
588
+ "qformer.bert.encoder.layer.8.crossattention.output.dense.weight": "model-00001-of-00004.safetensors",
589
+ "qformer.bert.encoder.layer.8.crossattention.self.key.bias": "model-00001-of-00004.safetensors",
590
+ "qformer.bert.encoder.layer.8.crossattention.self.key.weight": "model-00001-of-00004.safetensors",
591
+ "qformer.bert.encoder.layer.8.crossattention.self.query.bias": "model-00001-of-00004.safetensors",
592
+ "qformer.bert.encoder.layer.8.crossattention.self.query.weight": "model-00001-of-00004.safetensors",
593
+ "qformer.bert.encoder.layer.8.crossattention.self.value.bias": "model-00001-of-00004.safetensors",
594
+ "qformer.bert.encoder.layer.8.crossattention.self.value.weight": "model-00001-of-00004.safetensors",
595
+ "qformer.bert.encoder.layer.8.intermediate.dense.bias": "model-00001-of-00004.safetensors",
596
+ "qformer.bert.encoder.layer.8.intermediate.dense.weight": "model-00001-of-00004.safetensors",
597
+ "qformer.bert.encoder.layer.8.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
598
+ "qformer.bert.encoder.layer.8.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
599
+ "qformer.bert.encoder.layer.8.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
600
+ "qformer.bert.encoder.layer.8.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
601
+ "qformer.bert.encoder.layer.8.output.dense.bias": "model-00001-of-00004.safetensors",
602
+ "qformer.bert.encoder.layer.8.output.dense.weight": "model-00001-of-00004.safetensors",
603
+ "qformer.bert.encoder.layer.8.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
604
+ "qformer.bert.encoder.layer.8.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
605
+ "qformer.bert.encoder.layer.8.output_query.dense.bias": "model-00001-of-00004.safetensors",
606
+ "qformer.bert.encoder.layer.8.output_query.dense.weight": "model-00001-of-00004.safetensors",
607
+ "qformer.bert.encoder.layer.9.attention.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
608
+ "qformer.bert.encoder.layer.9.attention.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
609
+ "qformer.bert.encoder.layer.9.attention.output.dense.bias": "model-00001-of-00004.safetensors",
610
+ "qformer.bert.encoder.layer.9.attention.output.dense.weight": "model-00001-of-00004.safetensors",
611
+ "qformer.bert.encoder.layer.9.attention.self.key.bias": "model-00001-of-00004.safetensors",
612
+ "qformer.bert.encoder.layer.9.attention.self.key.weight": "model-00001-of-00004.safetensors",
613
+ "qformer.bert.encoder.layer.9.attention.self.query.bias": "model-00001-of-00004.safetensors",
614
+ "qformer.bert.encoder.layer.9.attention.self.query.weight": "model-00001-of-00004.safetensors",
615
+ "qformer.bert.encoder.layer.9.attention.self.value.bias": "model-00001-of-00004.safetensors",
616
+ "qformer.bert.encoder.layer.9.attention.self.value.weight": "model-00001-of-00004.safetensors",
617
+ "qformer.bert.encoder.layer.9.intermediate.dense.bias": "model-00001-of-00004.safetensors",
618
+ "qformer.bert.encoder.layer.9.intermediate.dense.weight": "model-00001-of-00004.safetensors",
619
+ "qformer.bert.encoder.layer.9.intermediate_query.dense.bias": "model-00001-of-00004.safetensors",
620
+ "qformer.bert.encoder.layer.9.intermediate_query.dense.weight": "model-00001-of-00004.safetensors",
621
+ "qformer.bert.encoder.layer.9.output.LayerNorm.bias": "model-00001-of-00004.safetensors",
622
+ "qformer.bert.encoder.layer.9.output.LayerNorm.weight": "model-00001-of-00004.safetensors",
623
+ "qformer.bert.encoder.layer.9.output.dense.bias": "model-00001-of-00004.safetensors",
624
+ "qformer.bert.encoder.layer.9.output.dense.weight": "model-00001-of-00004.safetensors",
625
+ "qformer.bert.encoder.layer.9.output_query.LayerNorm.bias": "model-00001-of-00004.safetensors",
626
+ "qformer.bert.encoder.layer.9.output_query.LayerNorm.weight": "model-00001-of-00004.safetensors",
627
+ "qformer.bert.encoder.layer.9.output_query.dense.bias": "model-00001-of-00004.safetensors",
628
+ "qformer.bert.encoder.layer.9.output_query.dense.weight": "model-00001-of-00004.safetensors",
629
+ "query_tokens": "model-00001-of-00004.safetensors",
630
+ "vision_encoder.encoder.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
631
+ "vision_encoder.encoder.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
632
+ "vision_encoder.encoder.blocks.0.attn.q_bias": "model-00001-of-00004.safetensors",
633
+ "vision_encoder.encoder.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
634
+ "vision_encoder.encoder.blocks.0.attn.v_bias": "model-00001-of-00004.safetensors",
635
+ "vision_encoder.encoder.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
636
+ "vision_encoder.encoder.blocks.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
637
+ "vision_encoder.encoder.blocks.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
638
+ "vision_encoder.encoder.blocks.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
639
+ "vision_encoder.encoder.blocks.0.norm1.bias": "model-00001-of-00004.safetensors",
640
+ "vision_encoder.encoder.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
641
+ "vision_encoder.encoder.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
642
+ "vision_encoder.encoder.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
643
+ "vision_encoder.encoder.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
644
+ "vision_encoder.encoder.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
645
+ "vision_encoder.encoder.blocks.1.attn.q_bias": "model-00001-of-00004.safetensors",
646
+ "vision_encoder.encoder.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
647
+ "vision_encoder.encoder.blocks.1.attn.v_bias": "model-00001-of-00004.safetensors",
648
+ "vision_encoder.encoder.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
649
+ "vision_encoder.encoder.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
650
+ "vision_encoder.encoder.blocks.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
651
+ "vision_encoder.encoder.blocks.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
652
+ "vision_encoder.encoder.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
653
+ "vision_encoder.encoder.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
654
+ "vision_encoder.encoder.blocks.1.norm2.bias": "model-00001-of-00004.safetensors",
655
+ "vision_encoder.encoder.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
656
+ "vision_encoder.encoder.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
657
+ "vision_encoder.encoder.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
658
+ "vision_encoder.encoder.blocks.10.attn.q_bias": "model-00001-of-00004.safetensors",
659
+ "vision_encoder.encoder.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
660
+ "vision_encoder.encoder.blocks.10.attn.v_bias": "model-00001-of-00004.safetensors",
661
+ "vision_encoder.encoder.blocks.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
662
+ "vision_encoder.encoder.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
663
+ "vision_encoder.encoder.blocks.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
664
+ "vision_encoder.encoder.blocks.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
665
+ "vision_encoder.encoder.blocks.10.norm1.bias": "model-00001-of-00004.safetensors",
666
+ "vision_encoder.encoder.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
667
+ "vision_encoder.encoder.blocks.10.norm2.bias": "model-00001-of-00004.safetensors",
668
+ "vision_encoder.encoder.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
669
+ "vision_encoder.encoder.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
670
+ "vision_encoder.encoder.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
671
+ "vision_encoder.encoder.blocks.11.attn.q_bias": "model-00001-of-00004.safetensors",
672
+ "vision_encoder.encoder.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
673
+ "vision_encoder.encoder.blocks.11.attn.v_bias": "model-00001-of-00004.safetensors",
674
+ "vision_encoder.encoder.blocks.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
675
+ "vision_encoder.encoder.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
676
+ "vision_encoder.encoder.blocks.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
677
+ "vision_encoder.encoder.blocks.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
678
+ "vision_encoder.encoder.blocks.11.norm1.bias": "model-00001-of-00004.safetensors",
679
+ "vision_encoder.encoder.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
680
+ "vision_encoder.encoder.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
681
+ "vision_encoder.encoder.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "vision_encoder.encoder.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
683
+ "vision_encoder.encoder.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
684
+ "vision_encoder.encoder.blocks.12.attn.q_bias": "model-00001-of-00004.safetensors",
685
+ "vision_encoder.encoder.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
686
+ "vision_encoder.encoder.blocks.12.attn.v_bias": "model-00001-of-00004.safetensors",
687
+ "vision_encoder.encoder.blocks.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
688
+ "vision_encoder.encoder.blocks.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
689
+ "vision_encoder.encoder.blocks.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
690
+ "vision_encoder.encoder.blocks.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
691
+ "vision_encoder.encoder.blocks.12.norm1.bias": "model-00001-of-00004.safetensors",
692
+ "vision_encoder.encoder.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
693
+ "vision_encoder.encoder.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
694
+ "vision_encoder.encoder.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
695
+ "vision_encoder.encoder.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
696
+ "vision_encoder.encoder.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
697
+ "vision_encoder.encoder.blocks.13.attn.q_bias": "model-00001-of-00004.safetensors",
698
+ "vision_encoder.encoder.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
699
+ "vision_encoder.encoder.blocks.13.attn.v_bias": "model-00001-of-00004.safetensors",
700
+ "vision_encoder.encoder.blocks.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
701
+ "vision_encoder.encoder.blocks.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
702
+ "vision_encoder.encoder.blocks.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
703
+ "vision_encoder.encoder.blocks.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
704
+ "vision_encoder.encoder.blocks.13.norm1.bias": "model-00001-of-00004.safetensors",
705
+ "vision_encoder.encoder.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
706
+ "vision_encoder.encoder.blocks.13.norm2.bias": "model-00001-of-00004.safetensors",
707
+ "vision_encoder.encoder.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
708
+ "vision_encoder.encoder.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
709
+ "vision_encoder.encoder.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
710
+ "vision_encoder.encoder.blocks.14.attn.q_bias": "model-00001-of-00004.safetensors",
711
+ "vision_encoder.encoder.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
712
+ "vision_encoder.encoder.blocks.14.attn.v_bias": "model-00001-of-00004.safetensors",
713
+ "vision_encoder.encoder.blocks.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
714
+ "vision_encoder.encoder.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
715
+ "vision_encoder.encoder.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
716
+ "vision_encoder.encoder.blocks.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
717
+ "vision_encoder.encoder.blocks.14.norm1.bias": "model-00001-of-00004.safetensors",
718
+ "vision_encoder.encoder.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
719
+ "vision_encoder.encoder.blocks.14.norm2.bias": "model-00001-of-00004.safetensors",
720
+ "vision_encoder.encoder.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
721
+ "vision_encoder.encoder.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
722
+ "vision_encoder.encoder.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
723
+ "vision_encoder.encoder.blocks.15.attn.q_bias": "model-00001-of-00004.safetensors",
724
+ "vision_encoder.encoder.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
725
+ "vision_encoder.encoder.blocks.15.attn.v_bias": "model-00001-of-00004.safetensors",
726
+ "vision_encoder.encoder.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
727
+ "vision_encoder.encoder.blocks.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
728
+ "vision_encoder.encoder.blocks.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
729
+ "vision_encoder.encoder.blocks.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
730
+ "vision_encoder.encoder.blocks.15.norm1.bias": "model-00001-of-00004.safetensors",
731
+ "vision_encoder.encoder.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
732
+ "vision_encoder.encoder.blocks.15.norm2.bias": "model-00001-of-00004.safetensors",
733
+ "vision_encoder.encoder.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
734
+ "vision_encoder.encoder.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
735
+ "vision_encoder.encoder.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
736
+ "vision_encoder.encoder.blocks.16.attn.q_bias": "model-00001-of-00004.safetensors",
737
+ "vision_encoder.encoder.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
738
+ "vision_encoder.encoder.blocks.16.attn.v_bias": "model-00001-of-00004.safetensors",
739
+ "vision_encoder.encoder.blocks.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
740
+ "vision_encoder.encoder.blocks.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
741
+ "vision_encoder.encoder.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
742
+ "vision_encoder.encoder.blocks.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
743
+ "vision_encoder.encoder.blocks.16.norm1.bias": "model-00001-of-00004.safetensors",
744
+ "vision_encoder.encoder.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
745
+ "vision_encoder.encoder.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
746
+ "vision_encoder.encoder.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
747
+ "vision_encoder.encoder.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
748
+ "vision_encoder.encoder.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
749
+ "vision_encoder.encoder.blocks.17.attn.q_bias": "model-00001-of-00004.safetensors",
750
+ "vision_encoder.encoder.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
751
+ "vision_encoder.encoder.blocks.17.attn.v_bias": "model-00001-of-00004.safetensors",
752
+ "vision_encoder.encoder.blocks.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
753
+ "vision_encoder.encoder.blocks.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
754
+ "vision_encoder.encoder.blocks.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
755
+ "vision_encoder.encoder.blocks.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
756
+ "vision_encoder.encoder.blocks.17.norm1.bias": "model-00001-of-00004.safetensors",
757
+ "vision_encoder.encoder.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
758
+ "vision_encoder.encoder.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
759
+ "vision_encoder.encoder.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
760
+ "vision_encoder.encoder.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
761
+ "vision_encoder.encoder.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
762
+ "vision_encoder.encoder.blocks.18.attn.q_bias": "model-00001-of-00004.safetensors",
763
+ "vision_encoder.encoder.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
764
+ "vision_encoder.encoder.blocks.18.attn.v_bias": "model-00001-of-00004.safetensors",
765
+ "vision_encoder.encoder.blocks.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
766
+ "vision_encoder.encoder.blocks.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
767
+ "vision_encoder.encoder.blocks.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
768
+ "vision_encoder.encoder.blocks.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
769
+ "vision_encoder.encoder.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
770
+ "vision_encoder.encoder.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
771
+ "vision_encoder.encoder.blocks.18.norm2.bias": "model-00001-of-00004.safetensors",
772
+ "vision_encoder.encoder.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
773
+ "vision_encoder.encoder.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
774
+ "vision_encoder.encoder.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
775
+ "vision_encoder.encoder.blocks.19.attn.q_bias": "model-00001-of-00004.safetensors",
776
+ "vision_encoder.encoder.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
777
+ "vision_encoder.encoder.blocks.19.attn.v_bias": "model-00001-of-00004.safetensors",
778
+ "vision_encoder.encoder.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
779
+ "vision_encoder.encoder.blocks.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
780
+ "vision_encoder.encoder.blocks.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
781
+ "vision_encoder.encoder.blocks.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
782
+ "vision_encoder.encoder.blocks.19.norm1.bias": "model-00001-of-00004.safetensors",
783
+ "vision_encoder.encoder.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
784
+ "vision_encoder.encoder.blocks.19.norm2.bias": "model-00001-of-00004.safetensors",
785
+ "vision_encoder.encoder.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
786
+ "vision_encoder.encoder.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
787
+ "vision_encoder.encoder.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
788
+ "vision_encoder.encoder.blocks.2.attn.q_bias": "model-00001-of-00004.safetensors",
789
+ "vision_encoder.encoder.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
790
+ "vision_encoder.encoder.blocks.2.attn.v_bias": "model-00001-of-00004.safetensors",
791
+ "vision_encoder.encoder.blocks.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
792
+ "vision_encoder.encoder.blocks.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
793
+ "vision_encoder.encoder.blocks.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
794
+ "vision_encoder.encoder.blocks.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
795
+ "vision_encoder.encoder.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
796
+ "vision_encoder.encoder.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
797
+ "vision_encoder.encoder.blocks.2.norm2.bias": "model-00001-of-00004.safetensors",
798
+ "vision_encoder.encoder.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
799
+ "vision_encoder.encoder.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
800
+ "vision_encoder.encoder.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
801
+ "vision_encoder.encoder.blocks.20.attn.q_bias": "model-00001-of-00004.safetensors",
802
+ "vision_encoder.encoder.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
803
+ "vision_encoder.encoder.blocks.20.attn.v_bias": "model-00001-of-00004.safetensors",
804
+ "vision_encoder.encoder.blocks.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
805
+ "vision_encoder.encoder.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
806
+ "vision_encoder.encoder.blocks.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
807
+ "vision_encoder.encoder.blocks.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
808
+ "vision_encoder.encoder.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
809
+ "vision_encoder.encoder.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
810
+ "vision_encoder.encoder.blocks.20.norm2.bias": "model-00001-of-00004.safetensors",
811
+ "vision_encoder.encoder.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
812
+ "vision_encoder.encoder.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
813
+ "vision_encoder.encoder.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
814
+ "vision_encoder.encoder.blocks.21.attn.q_bias": "model-00001-of-00004.safetensors",
815
+ "vision_encoder.encoder.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
816
+ "vision_encoder.encoder.blocks.21.attn.v_bias": "model-00001-of-00004.safetensors",
817
+ "vision_encoder.encoder.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
818
+ "vision_encoder.encoder.blocks.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
819
+ "vision_encoder.encoder.blocks.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
820
+ "vision_encoder.encoder.blocks.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
821
+ "vision_encoder.encoder.blocks.21.norm1.bias": "model-00001-of-00004.safetensors",
822
+ "vision_encoder.encoder.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
823
+ "vision_encoder.encoder.blocks.21.norm2.bias": "model-00001-of-00004.safetensors",
824
+ "vision_encoder.encoder.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
825
+ "vision_encoder.encoder.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
826
+ "vision_encoder.encoder.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
827
+ "vision_encoder.encoder.blocks.22.attn.q_bias": "model-00001-of-00004.safetensors",
828
+ "vision_encoder.encoder.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
829
+ "vision_encoder.encoder.blocks.22.attn.v_bias": "model-00001-of-00004.safetensors",
830
+ "vision_encoder.encoder.blocks.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
831
+ "vision_encoder.encoder.blocks.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
832
+ "vision_encoder.encoder.blocks.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
833
+ "vision_encoder.encoder.blocks.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
834
+ "vision_encoder.encoder.blocks.22.norm1.bias": "model-00001-of-00004.safetensors",
835
+ "vision_encoder.encoder.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
836
+ "vision_encoder.encoder.blocks.22.norm2.bias": "model-00001-of-00004.safetensors",
837
+ "vision_encoder.encoder.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
838
+ "vision_encoder.encoder.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
839
+ "vision_encoder.encoder.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
840
+ "vision_encoder.encoder.blocks.3.attn.q_bias": "model-00001-of-00004.safetensors",
841
+ "vision_encoder.encoder.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
842
+ "vision_encoder.encoder.blocks.3.attn.v_bias": "model-00001-of-00004.safetensors",
843
+ "vision_encoder.encoder.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
844
+ "vision_encoder.encoder.blocks.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
845
+ "vision_encoder.encoder.blocks.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
846
+ "vision_encoder.encoder.blocks.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
847
+ "vision_encoder.encoder.blocks.3.norm1.bias": "model-00001-of-00004.safetensors",
848
+ "vision_encoder.encoder.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
849
+ "vision_encoder.encoder.blocks.3.norm2.bias": "model-00001-of-00004.safetensors",
850
+ "vision_encoder.encoder.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
851
+ "vision_encoder.encoder.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
852
+ "vision_encoder.encoder.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
853
+ "vision_encoder.encoder.blocks.4.attn.q_bias": "model-00001-of-00004.safetensors",
854
+ "vision_encoder.encoder.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
855
+ "vision_encoder.encoder.blocks.4.attn.v_bias": "model-00001-of-00004.safetensors",
856
+ "vision_encoder.encoder.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
857
+ "vision_encoder.encoder.blocks.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
858
+ "vision_encoder.encoder.blocks.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
859
+ "vision_encoder.encoder.blocks.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
860
+ "vision_encoder.encoder.blocks.4.norm1.bias": "model-00001-of-00004.safetensors",
861
+ "vision_encoder.encoder.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
862
+ "vision_encoder.encoder.blocks.4.norm2.bias": "model-00001-of-00004.safetensors",
863
+ "vision_encoder.encoder.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
864
+ "vision_encoder.encoder.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
865
+ "vision_encoder.encoder.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
866
+ "vision_encoder.encoder.blocks.5.attn.q_bias": "model-00001-of-00004.safetensors",
867
+ "vision_encoder.encoder.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
868
+ "vision_encoder.encoder.blocks.5.attn.v_bias": "model-00001-of-00004.safetensors",
869
+ "vision_encoder.encoder.blocks.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
870
+ "vision_encoder.encoder.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
871
+ "vision_encoder.encoder.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
872
+ "vision_encoder.encoder.blocks.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
873
+ "vision_encoder.encoder.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
874
+ "vision_encoder.encoder.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
875
+ "vision_encoder.encoder.blocks.5.norm2.bias": "model-00001-of-00004.safetensors",
876
+ "vision_encoder.encoder.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
877
+ "vision_encoder.encoder.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
878
+ "vision_encoder.encoder.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
879
+ "vision_encoder.encoder.blocks.6.attn.q_bias": "model-00001-of-00004.safetensors",
880
+ "vision_encoder.encoder.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
881
+ "vision_encoder.encoder.blocks.6.attn.v_bias": "model-00001-of-00004.safetensors",
882
+ "vision_encoder.encoder.blocks.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
883
+ "vision_encoder.encoder.blocks.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
884
+ "vision_encoder.encoder.blocks.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
885
+ "vision_encoder.encoder.blocks.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
886
+ "vision_encoder.encoder.blocks.6.norm1.bias": "model-00001-of-00004.safetensors",
887
+ "vision_encoder.encoder.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
888
+ "vision_encoder.encoder.blocks.6.norm2.bias": "model-00001-of-00004.safetensors",
889
+ "vision_encoder.encoder.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
890
+ "vision_encoder.encoder.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
891
+ "vision_encoder.encoder.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
892
+ "vision_encoder.encoder.blocks.7.attn.q_bias": "model-00001-of-00004.safetensors",
893
+ "vision_encoder.encoder.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
894
+ "vision_encoder.encoder.blocks.7.attn.v_bias": "model-00001-of-00004.safetensors",
895
+ "vision_encoder.encoder.blocks.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
896
+ "vision_encoder.encoder.blocks.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
897
+ "vision_encoder.encoder.blocks.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
898
+ "vision_encoder.encoder.blocks.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
899
+ "vision_encoder.encoder.blocks.7.norm1.bias": "model-00001-of-00004.safetensors",
900
+ "vision_encoder.encoder.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
901
+ "vision_encoder.encoder.blocks.7.norm2.bias": "model-00001-of-00004.safetensors",
902
+ "vision_encoder.encoder.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
903
+ "vision_encoder.encoder.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
904
+ "vision_encoder.encoder.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
905
+ "vision_encoder.encoder.blocks.8.attn.q_bias": "model-00001-of-00004.safetensors",
906
+ "vision_encoder.encoder.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
907
+ "vision_encoder.encoder.blocks.8.attn.v_bias": "model-00001-of-00004.safetensors",
908
+ "vision_encoder.encoder.blocks.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
909
+ "vision_encoder.encoder.blocks.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
910
+ "vision_encoder.encoder.blocks.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
911
+ "vision_encoder.encoder.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
912
+ "vision_encoder.encoder.blocks.8.norm1.bias": "model-00001-of-00004.safetensors",
913
+ "vision_encoder.encoder.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
914
+ "vision_encoder.encoder.blocks.8.norm2.bias": "model-00001-of-00004.safetensors",
915
+ "vision_encoder.encoder.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
916
+ "vision_encoder.encoder.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
917
+ "vision_encoder.encoder.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
918
+ "vision_encoder.encoder.blocks.9.attn.q_bias": "model-00001-of-00004.safetensors",
919
+ "vision_encoder.encoder.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
920
+ "vision_encoder.encoder.blocks.9.attn.v_bias": "model-00001-of-00004.safetensors",
921
+ "vision_encoder.encoder.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
922
+ "vision_encoder.encoder.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
923
+ "vision_encoder.encoder.blocks.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
924
+ "vision_encoder.encoder.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
925
+ "vision_encoder.encoder.blocks.9.norm1.bias": "model-00001-of-00004.safetensors",
926
+ "vision_encoder.encoder.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
927
+ "vision_encoder.encoder.blocks.9.norm2.bias": "model-00001-of-00004.safetensors",
928
+ "vision_encoder.encoder.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
929
+ "vision_encoder.encoder.patch_embed.proj.bias": "model-00001-of-00004.safetensors",
930
+ "vision_encoder.encoder.patch_embed.proj.weight": "model-00001-of-00004.safetensors",
931
+ "vision_layernorm.bias": "model-00001-of-00004.safetensors",
932
+ "vision_layernorm.weight": "model-00001-of-00004.safetensors"
933
+ }
934
+ }
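The weight_map above simply records which shard file each tensor lives in. A minimal sketch of how such an index can be used to pull a single tensor, assuming the mapping is saved under the usual name model.safetensors.index.json next to the shards (the tensor key and shard names come from the listing above; the loading code itself is illustrative and not part of this repo):

import json
from safetensors import safe_open

# Load the shard index and look up which file holds a given tensor.
with open("model.safetensors.index.json") as f:
    weight_map = json.load(f)["weight_map"]

name = "vision_encoder.encoder.patch_embed.proj.weight"
shard_file = weight_map[name]  # e.g. "model-00001-of-00004.safetensors"

# Read just that tensor from its shard without loading the whole checkpoint.
with safe_open(shard_file, framework="pt", device="cpu") as shard:
    tensor = shard.get_tensor(name)
print(tensor.shape)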
videochat2_it_hd_mistral.py ADDED
@@ -0,0 +1,418 @@
1
+
2
+ import logging
3
+
4
+ import torch
5
+ from torch.cuda.amp import autocast as autocast
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from peft import get_peft_model, LoraConfig, TaskType
9
+
10
+ from .blip2 import Blip2Base, disabled_train
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, PretrainedConfig
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ from easydict import EasyDict
16
+ from .configuration_videochat2 import Config
17
+
18
+ class VideoChat2_it_hd_mistral(Blip2Base):
19
+ _auto_class='AutoModel'
20
+ config_class=Config
21
+ """
22
+ VideoChat2 model.
23
+ """
24
+ def __init__(self, config):
25
+ super().__init__()
26
+ # pretrained_path
27
+ self.config=config
28
+ if isinstance(config,(PretrainedConfig,AutoConfig)):
29
+ if hasattr(config,'cfg'): # my own cfg
30
+ config=EasyDict(config.cfg)
31
+ else:
32
+ config=EasyDict(config.to_dict())
33
+ pc=PretrainedConfig()
34
+ pc.update(config)
35
+ vit_blip_model_path = config.get("vit_blip_model_path", None)
36
+ mistral_model_path = config.get("mistral_model_path")
37
+ videochat2_model_path = config.get("videochat2_model_path", "")
38
+ freeze_vit = config.get("freeze_vit", True)
39
+ freeze_qformer = config.get("freeze_qformer", True)
40
+ freeze_llm = config.get("freeze_llm", True)
41
+ # vit
42
+ low_resource = config.get("low_resource", False) # use 8-bit and put the ViT on CPU
43
+ # qformer
44
+ num_query_token = config.get("num_query_token")
45
+ qformer_hidden_dropout_prob = config.get("qformer_hidden_dropout_prob", 0.1)
46
+ qformer_attention_probs_dropout_prob = config.get("qformer_attention_probs_dropout_prob", 0.1)
47
+ qformer_drop_path_rate = config.get("qformer_drop_path_rate", 0.1)
48
+ extra_num_query_token = config.get("extra_num_query_token", 32)
49
+ self.qformer_text_input = config.get("qformer_text_input", False)
50
+
51
+ # Infinite-Video related hyperparameters
52
+ num_basis = config.get("num_basis", 256)
53
+ sticky = config.get("sticky", True)
54
+ tau = config.get("tau", 0.75)
55
+ alpha = config.get("alpha", 0.75)
56
+
57
+ # prompt
58
+ max_txt_len = config.get("max_txt_len", 32)
59
+ self.human_start = "[INST]"
60
+ self.human_end = "[/INST]"
61
+ self.assist_end = "</s>"
62
+ self.start_token = config.get("start_token", "<Video>")
63
+ self.end_token = config.get("end_token", "</Video>")
64
+ self.img_start_token = config.get("img_start_token", "<Image>")
65
+ self.img_end_token = config.get("img_end_token", "</Image>")
66
+ logger.info(f"Add instruction in qformer: {self.qformer_text_input}")
67
+ # debug
68
+ self.debug = config.get("debug", False)
69
+ self.llm_bf16 = config.get("llm_bf16", False)
70
+ use_flash_attention = config.get("use_flash_attention", False)
71
+ self.use_lora = config.get("use_lora", False)
72
+ lora_r = config.get("lora_r", 8)
73
+ lora_alpha = config.get("lora_alpha", 32)
74
+ lora_dropout = config.get("lora_dropout", 0.05)
75
+ # dynamic resolution
76
+ self.local_size = config.dynamic_config.get("local_size", 224)
77
+ self.add_global = config.dynamic_config.get("add_global", True)
78
+
79
+ self.tokenizer = self.init_tokenizer(truncation_side="left")
80
+ self.tokenizer.padding_side = "left"
81
+ self.low_resource = low_resource
82
+ self.vision_encoder, self.vision_layernorm = self.init_vision_encoder_umt(config)
83
+ self.qformer, self.query_tokens = self.init_Qformer(
84
+ num_query_token, config.vision_encoder.encoder_embed_dim,
85
+ qformer_hidden_dropout_prob=qformer_hidden_dropout_prob,
86
+ qformer_attention_probs_dropout_prob=qformer_attention_probs_dropout_prob,
87
+ qformer_drop_path_rate=qformer_drop_path_rate,
88
+ num_basis=num_basis, alpha=alpha, tau=tau, sticky=sticky,
89
+ )
90
+
91
+ if not self.qformer_text_input:
92
+ self.qformer.bert.embeddings.word_embeddings = None
93
+ self.qformer.bert.embeddings.position_embeddings = None
94
+ for layer in self.qformer.bert.encoder.layer:
95
+ layer.output = None
96
+ layer.intermediate = None
97
+ else:
98
+ self.qformer.resize_token_embeddings(len(self.tokenizer))
99
+ self.qformer.cls = None
100
+
101
+ if vit_blip_model_path:
102
+ logger.info(f"Load ViT and QFormer from {vit_blip_model_path}")
103
+ state_dict = torch.load(vit_blip_model_path, map_location="cpu")
104
+ msg = self.load_state_dict(state_dict, strict=False)
105
+ logger.info(msg)
106
+ logger.info('Loading ViT and Q-Former Done')
107
+
108
+ self.extra_num_query_token = extra_num_query_token
109
+ if extra_num_query_token > 0:
110
+ logger.info(f"Add extra {extra_num_query_token} tokens in QFormer")
111
+ self.extra_query_tokens = nn.Parameter(
112
+ torch.zeros(1, extra_num_query_token, self.query_tokens.shape[-1])
113
+ )
114
+
115
+ if freeze_vit:
116
+ logger.info("freeze vision encoder")
117
+ for _, param in self.vision_encoder.named_parameters():
118
+ param.requires_grad = False
119
+ self.vision_encoder = self.vision_encoder.eval()
120
+ self.vision_encoder.train = disabled_train
121
+ for _, param in self.vision_layernorm.named_parameters():
122
+ param.requires_grad = False
123
+ self.vision_layernorm = self.vision_layernorm.eval()
124
+ self.vision_layernorm.train = disabled_train
125
+
126
+ if freeze_qformer:
127
+ logger.info("freeze Qformer")
128
+ for _, param in self.qformer.named_parameters():
129
+ param.requires_grad = False
130
+ self.qformer = self.qformer.eval()
131
+ self.qformer.train = disabled_train
132
+ self.query_tokens.requires_grad = False
133
+
134
+ logger.info('Loading Mistral')
135
+ self.mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_model_path)
136
+ self.mistral_tokenizer.padding_side = "left"
137
+ if not self.mistral_tokenizer.pad_token:
138
+ logger.info("Set pad_token")
139
+ self.mistral_tokenizer.pad_token = self.mistral_tokenizer.eos_token
140
+
141
+ if self.debug:
142
+ logger.info("Debug mode, build small Mistral")
143
+ mistral_config = AutoConfig.from_pretrained(mistral_model_path)
144
+ mistral_config.hidden_size = 512
145
+ mistral_config.intermediate_size = 2048
146
+ mistral_config.num_attention_heads = 8
147
+ mistral_config.num_hidden_layers = 12
148
+ mistral_config.torch_dtype = torch.float16
149
+ self.mistral_model = AutoModelForCausalLM.from_config(mistral_config)
150
+ else:
151
+ if use_flash_attention:
152
+ self.mistral_model = AutoModelForCausalLM.from_pretrained(
153
+ mistral_model_path,
154
+ torch_dtype=torch.bfloat16 if self.llm_bf16 else torch.float16,
155
+ # use_flash_attention_2=True,
156
+ attn_implementation="flash_attention_2",
157
+ )
158
+ else:
159
+ self.mistral_model = AutoModelForCausalLM.from_pretrained(
160
+ mistral_model_path,
161
+ torch_dtype=torch.bfloat16 if self.llm_bf16 else torch.float16,
162
+ )
163
+
164
+ if freeze_llm:
165
+ logger.info("freeze Mistral")
166
+ for _, param in self.mistral_model.named_parameters():
167
+ param.requires_grad = False
168
+ logger.info('Loading Mistral Done')
169
+
170
+ if self.use_lora:
171
+ logger.info("Use lora")
172
+ peft_config = LoraConfig(
173
+ task_type=TaskType.CAUSAL_LM, inference_mode=False,
174
+ r=lora_r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
175
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
176
+ "gate_proj", "up_proj", "down_proj", "lm_head"]
177
+ )
178
+ self.mistral_model = get_peft_model(self.mistral_model, peft_config)
179
+ if not freeze_llm:
180
+ logger.info("Unfreeze Mistral")
181
+ for _, param in self.mistral_model.base_model.named_parameters():
182
+ param.requires_grad = True
183
+ self.mistral_model.print_trainable_parameters()
184
+
185
+ self.mistral_proj = nn.Linear(
186
+ self.qformer.config.hidden_size, self.mistral_model.config.hidden_size
187
+ )
188
+ self.max_txt_len = max_txt_len
189
+
190
+ # load weights of VideoChat2
191
+ if videochat2_model_path:
192
+ logger.info(f"Load VideoChat2 from: {videochat2_model_path}")
193
+ ckpt = torch.load(videochat2_model_path, map_location="cpu")
194
+ if 'model' in ckpt.keys():
195
+ msg = self.load_state_dict(ckpt['model'], strict=False)
196
+ else:
197
+ msg = self.load_state_dict(ckpt, strict=False)
198
+ logger.info(msg)
199
+ self.config=pc
200
+
201
+ def vit_to_cpu(self):
202
+ self.vision_layernorm.to("cpu")
203
+ self.vision_layernorm.float()
204
+ self.vision_encoder.to("cpu")
205
+ self.vision_encoder.float()
206
+
207
+ def encode_img(self, image, instruction, new_video=False):
208
+ device = image[0].device
209
+ if self.low_resource:
210
+ self.vit_to_cpu()
211
+ image = [img.to("cpu") for img in image]
212
+
213
+ with self.maybe_autocast():
214
+ # split the image or video according to the shape
215
+ shapes = []
216
+ input_imgs = []
217
+ input_instructions = []
218
+ for idx, img in enumerate(image):
219
+ # logger.info(f"Input shape: {img.shape}")
220
+ T, C, H, W = img.shape
221
+ shapes.append([H//self.local_size, W//self.local_size])
222
+ sub_img = img.reshape(
223
+ 1, T, 3, H//self.local_size, self.local_size, W//self.local_size, self.local_size
224
+ ).permute(0, 3, 5, 1, 2, 4, 6).reshape(-1, T, 3, self.local_size, self.local_size).contiguous()
225
+ input_imgs.append(sub_img)
226
+ input_instructions.extend([instruction[idx]] * len(sub_img))
227
+ if self.add_global:
228
+ glb_img = F.interpolate(
229
+ img.float(), size=(self.local_size, self.local_size), mode='bicubic', align_corners=False
230
+ ).to(sub_img.dtype)
231
+ input_imgs.append(glb_img.unsqueeze(0))
232
+ input_instructions.append(instruction[idx])
233
+ input_imgs = torch.cat(input_imgs, dim=0)
234
+
235
+ T = input_imgs.shape[1]
236
+ use_image = True if T == 1 else False
237
+ input_imgs = input_imgs.permute(0, 2, 1, 3, 4) # [B,T,C,H,W] -> [B,C,T,H,W]
238
+
239
+ image_embeds = self.vision_encoder(input_imgs, use_image)
240
+ B, T, L, C = image_embeds.shape
241
+ image_embeds = image_embeds.reshape(B, -1, C)
242
+ image_embeds = self.vision_layernorm(image_embeds).to(device) # [B, T*L, C]
243
+
244
+ image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
245
+
246
+ if self.extra_num_query_token > 0:
247
+ query_tokens = torch.cat([self.query_tokens, self.extra_query_tokens], dim=1)
248
+ else:
249
+ query_tokens = self.query_tokens
250
+ query_tokens = query_tokens.expand(image_embeds.shape[0], -1, -1)
251
+ if self.qformer_text_input:
252
+ text_Qformer = self.tokenizer(
253
+ input_instructions,
254
+ padding='longest',
255
+ truncation=True,
256
+ max_length=self.max_txt_len,
257
+ return_tensors="pt",
258
+ ).to(image_embeds.device)
259
+ query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image_embeds.device)
260
+ Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1)
261
+
262
+ query_output = self.qformer.bert(
263
+ text_Qformer.input_ids,
264
+ attention_mask=Qformer_atts,
265
+ query_embeds=query_tokens,
266
+ encoder_hidden_states=image_embeds,
267
+ encoder_attention_mask=image_atts,
268
+ return_dict=True,
269
+ new_video=new_video,
270
+ )
271
+ else:
272
+ query_output = self.qformer.bert(
273
+ query_embeds=query_tokens,
274
+ encoder_hidden_states=image_embeds,
275
+ encoder_attention_mask=image_atts,
276
+ return_dict=True,
277
+ new_video=new_video
278
+ )
279
+
280
+ qformer_features = self.mistral_proj(query_output.last_hidden_state[:, :query_tokens.size(1), :])
281
+ q_C = qformer_features.shape[-1]
282
+
283
+ # merge the features from the different splits
284
+ # stolen from https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b/blob/main/build_mlp.py#L97-L115
285
+ output_imgs = []
286
+ output_len = []
287
+ for [h, w] in shapes:
288
+ B_ = h * w
289
+ if self.add_global:
290
+ output_imgs.append(qformer_features[:B_+1].view(1, -1, q_C))
291
+ qformer_features = qformer_features[B_+1:]
292
+ else:
293
+ output_imgs.append(qformer_features[:B_].view(1, -1, q_C))
294
+ qformer_features = qformer_features[B_:]
295
+ # logger.info(f"Features shape: {output_imgs[-1].shape}")
296
+ output_len.append(output_imgs[-1].shape[1])
297
+
298
+ return output_imgs, output_len, use_image
299
+
300
+ def _get_text_len(self, text):
301
+ return self.mistral_tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.shape[1]
302
+
303
+ def forward(self, image, text_input, instruction):
304
+ if len(image[0].shape) == 1:
305
+ use_text = True
306
+ device = image[0].device
307
+ batch_size = len(image)
308
+ img_lens = [0] * batch_size
309
+ else:
310
+ use_text = False
311
+ img_embeds, img_lens, use_image = self.encode_img(image, instruction)
312
+ device = img_embeds[0].device
313
+ batch_size = len(img_embeds)
314
+
315
+ # mark the largest length
316
+ # when padding, the attention mask will be 0
317
+ max_len = 0
318
+ input_embed_list = []
319
+ p_before_len_list = []
320
+ target_list = []
321
+ # handle each prompt individually
322
+ for idx, prompt in enumerate(text_input):
323
+ if use_text:
324
+ p_after = prompt
325
+ p_after_tokens = self.mistral_tokenizer(p_after, return_tensors="pt", add_special_tokens=False).to(device)
326
+ if self.use_lora:
327
+ p_after_embeds = self.mistral_model.base_model.model.model.embed_tokens(p_after_tokens.input_ids)
328
+ else:
329
+ p_after_embeds = self.mistral_model.model.embed_tokens(p_after_tokens.input_ids)
330
+ input_embeds = p_after_embeds
331
+ else:
332
+ tmp_img_embeds = img_embeds[idx]
333
+ # split the prompt via END_TOKEN
334
+ end_token = self.img_end_token if use_image else self.end_token
335
+ p_before, p_after = prompt.split(end_token)
336
+ p_after = end_token + p_after
337
+ p_before_tokens = self.mistral_tokenizer(p_before, return_tensors="pt", add_special_tokens=False).to(tmp_img_embeds.device)
338
+ p_after_tokens = self.mistral_tokenizer(p_after, return_tensors="pt", add_special_tokens=False).to(tmp_img_embeds.device)
339
+ if self.use_lora:
340
+ p_before_embeds = self.mistral_model.base_model.model.model.embed_tokens(p_before_tokens.input_ids)
341
+ p_after_embeds = self.mistral_model.base_model.model.model.embed_tokens(p_after_tokens.input_ids)
342
+ else:
343
+ p_before_embeds = self.mistral_model.model.embed_tokens(p_before_tokens.input_ids)
344
+ p_after_embeds = self.mistral_model.model.embed_tokens(p_after_tokens.input_ids)
345
+ input_embeds = torch.cat([p_before_embeds, tmp_img_embeds, p_after_embeds], dim=1)
346
+
347
+ # extract the answers and mask the target
348
+ # the answers are only in the p_after
349
+ sep1 = self.human_start + " "
350
+ sep2 = " " + self.human_end + " "
351
+ raw_text = p_after.split(sep2)
352
+ for idx in range(0, len(raw_text) - 1):
353
+ raw_text[idx] = raw_text[idx] + sep2
354
+ # the first raw_text contains system and question
355
+ # the last raw_text only contains answer
356
+ # rstrip() for the extra " "
357
+ answer_targets = p_after_tokens.input_ids.clone()
358
+ # [target] "xxxxx. </s>"
359
+ cur_len = self._get_text_len(raw_text[0].rstrip())
360
+ answer_targets[:, :cur_len] = -100
361
+ for text in raw_text[1:-1]:
362
+ total_len = self._get_text_len(text.rstrip())
363
+ ans_len = self._get_text_len((text.split(sep1)[0]).rstrip())
364
+ answer_targets[:, (cur_len+ans_len):(cur_len+total_len)] = -100
365
+ cur_len += total_len
366
+ cur_len += self._get_text_len(raw_text[-1].rstrip())
367
+
368
+ if self.debug: # Inspect and check the correctness of masking
369
+ z = answer_targets[0].clone()
370
+ z = torch.where(z == -100, self.mistral_tokenizer.unk_token_id, z)
371
+ logger.info(self.mistral_tokenizer.decode(z))
372
+
373
+ assert cur_len == answer_targets.shape[1], f"The final length ({cur_len}) is not equal to the original prompt ({answer_targets.shape[1]}): {prompt}"
374
+
375
+ max_len = max(max_len, input_embeds.shape[1])
376
+ input_embed_list.append(input_embeds)
377
+ if use_text:
378
+ p_before_len_list.append(0)
379
+ else:
380
+ p_before_len_list.append(p_before_tokens.input_ids.shape[1])
381
+ target_list.append(answer_targets)
382
+
383
+ # plus one for bos
384
+ # max_txt_len plus num_query_token is the max len
385
+ txt_len = min(max_len + 1, self.max_txt_len + max(img_lens))
386
+ inputs_embeds = torch.ones([batch_size, txt_len], dtype=torch.long).to(device) * self.mistral_tokenizer.pad_token_id
387
+ if self.use_lora:
388
+ inputs_embeds = self.mistral_model.base_model.model.model.embed_tokens(inputs_embeds)
389
+ else:
390
+ inputs_embeds = self.mistral_model.model.embed_tokens(inputs_embeds)
391
+ attention_mask = torch.zeros([batch_size, txt_len], dtype=torch.long).to(device)
392
+ targets = torch.ones([batch_size, txt_len], dtype=torch.long).to(device).fill_(-100)
393
+ # set bos_token
394
+ inputs_embeds[:, :1] = self.mistral_tokenizer.bos_token_id
395
+
396
+ for idx in range(batch_size):
397
+ input_len = min(input_embed_list[idx].shape[1], txt_len - 1)
398
+ # if less than txt_len, the input will be padding
399
+ # if more than txt_len, the input will be truncated
400
+ inputs_embeds[idx, 1:(input_len+1)] = input_embed_list[idx][:, :input_len]
401
+ # the attention_mask is 0 when padding
402
+ attention_mask[idx, :(input_len+1)] = 1
403
+ # the target is -100 when padding
404
+ p_before_len = p_before_len_list[idx]
405
+ targets[idx, (p_before_len+img_lens[idx]+1):(input_len+1)] = target_list[idx][0, :(input_len-p_before_len-img_lens[idx])]
406
+
407
+ with self.maybe_autocast():
408
+ outputs = self.mistral_model(
409
+ inputs_embeds=inputs_embeds,
410
+ attention_mask=attention_mask,
411
+ return_dict=True,
412
+ labels=targets,
413
+ use_cache=False, # current flash_attn2 does not support right padding for Mistral
414
+ )
415
+
416
+ return dict(
417
+ loss=outputs.loss,
418
+ )
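Since the class sets _auto_class='AutoModel' and config_class=Config, it is intended to be loaded through transformers' remote-code mechanism. A minimal usage sketch, assuming the uploaded config exposes the class via auto_map (the repo id and dtype below are placeholders, not confirmed by this commit):

import torch
from transformers import AutoModel

# trust_remote_code=True is required so transformers runs the modeling code in this repo.
model = AutoModel.from_pretrained(
    "your-namespace/videochat2-hd-mistral",  # hypothetical repo id
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
model.eval()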
vit.py ADDED
@@ -0,0 +1,472 @@
1
+ import logging
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torch.utils.checkpoint as checkpoint
7
+ from functools import partial
8
+
9
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def _cfg(url='', **kwargs):
15
+ return {
16
+ 'url': url,
17
+ 'num_classes': 400, 'input_size': (3, 224, 224), 'pool_size': None,
18
+ 'crop_pct': .9, 'interpolation': 'bicubic',
19
+ 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
20
+ **kwargs
21
+ }
22
+
23
+
24
+ class DropPath(nn.Module):
25
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
26
+ """
27
+ def __init__(self, drop_prob=None):
28
+ super(DropPath, self).__init__()
29
+ self.drop_prob = drop_prob
30
+
31
+ def forward(self, x):
32
+ return drop_path(x, self.drop_prob, self.training)
33
+
34
+ def extra_repr(self) -> str:
35
+ return 'p={}'.format(self.drop_prob)
36
+
37
+
38
+ class Mlp(nn.Module):
39
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
40
+ super().__init__()
41
+ out_features = out_features or in_features
42
+ hidden_features = hidden_features or in_features
43
+ self.fc1 = nn.Linear(in_features, hidden_features)
44
+ self.act = act_layer()
45
+ self.fc2 = nn.Linear(hidden_features, out_features)
46
+ self.drop = nn.Dropout(drop)
47
+
48
+ def forward(self, x):
49
+ x = self.fc1(x)
50
+ x = self.act(x)
51
+ x = self.drop(x)
52
+ x = self.fc2(x)
53
+ x = self.drop(x)
54
+ return x
55
+
56
+
57
+ class Attention(nn.Module):
58
+ def __init__(
59
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
60
+ proj_drop=0., attn_head_dim=None):
61
+ super().__init__()
62
+ self.num_heads = num_heads
63
+ head_dim = dim // num_heads
64
+ if attn_head_dim is not None:
65
+ head_dim = attn_head_dim
66
+ all_head_dim = head_dim * self.num_heads
67
+ self.scale = qk_scale or head_dim ** -0.5
68
+
69
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
70
+ if qkv_bias:
71
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
72
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
73
+ else:
74
+ self.q_bias = None
75
+ self.v_bias = None
76
+
77
+ self.attn_drop = nn.Dropout(attn_drop)
78
+ self.proj = nn.Linear(all_head_dim, dim)
79
+ self.proj_drop = nn.Dropout(proj_drop)
80
+
81
+ def forward(self, x):
82
+ B, N, C = x.shape
83
+ qkv_bias = None
84
+ if self.q_bias is not None:
85
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
86
+ # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
87
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
88
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
89
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
90
+
91
+ q = q * self.scale
92
+ attn = (q @ k.transpose(-2, -1))
93
+
94
+ attn = attn.softmax(dim=-1)
95
+ attn = self.attn_drop(attn)
96
+
97
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
98
+ x = self.proj(x)
99
+ x = self.proj_drop(x)
100
+ return x
101
+
102
+
103
+ class Block(nn.Module):
104
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
105
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
106
+ attn_head_dim=None):
107
+ super().__init__()
108
+ self.norm1 = norm_layer(dim)
109
+ self.attn = Attention(
110
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
111
+ attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
112
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
113
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
114
+ self.norm2 = norm_layer(dim)
115
+ mlp_hidden_dim = int(dim * mlp_ratio)
116
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
117
+
118
+ if init_values is not None and init_values > 0:
119
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
120
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
121
+ else:
122
+ self.gamma_1, self.gamma_2 = None, None
123
+
124
+ def forward(self, x):
125
+ if self.gamma_1 is None:
126
+ x = x + self.drop_path(self.attn(self.norm1(x)))
127
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
128
+ else:
129
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
130
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
131
+ return x
132
+
133
+
134
+ class PatchEmbed(nn.Module):
135
+ """ Image to Patch Embedding
136
+ """
137
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, num_frames=16, tubelet_size=2):
138
+ super().__init__()
139
+ img_size = to_2tuple(img_size)
140
+ patch_size = to_2tuple(patch_size)
141
+ self.tubelet_size = int(tubelet_size)
142
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
143
+ self.img_size = img_size
144
+ self.patch_size = patch_size
145
+ self.num_patches = num_patches
146
+ self.proj = nn.Conv3d(
147
+ in_channels=in_chans, out_channels=embed_dim,
148
+ kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
149
+ stride=(self.tubelet_size, patch_size[0], patch_size[1])
150
+ )
151
+ logger.info(f'Num of patches: {num_patches}')
152
+
153
+ def forward(self, x, **kwargs):
154
+ B, C, T, H, W = x.shape
155
+ # FIXME look at relaxing size constraints
156
+ # assert H == self.img_size[0] and W == self.img_size[1], \
157
+ # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
158
+ x = self.proj(x).flatten(2).transpose(1, 2)
159
+ return x
160
+
161
+ # sin-cos position encoding
162
+ # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
163
+ def get_sinusoid_encoding_table(n_position, d_hid, ckpt_num_frame=-1, cur_frame=12):
164
+ ''' Sinusoid position encoding table '''
165
+ # TODO: make it with torch instead of numpy
166
+ def get_position_angle_vec(position):
167
+ return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
168
+
169
+ if ckpt_num_frame != -1 and ckpt_num_frame != cur_frame:
170
+ logger.info(f"Interpolate position embedding")
171
+ logger.info(f"Testing frame: {cur_frame}")
172
+ logger.info(f"Checkpoint frame: {ckpt_num_frame}")
173
+
174
+ T = ckpt_num_frame # checkpoint frame
175
+ new_T = cur_frame # testing frame
176
+ n_position = n_position // new_T * T # generate checkpoint position embedding
177
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
178
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
179
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
180
+ sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
181
+ # interpolate
182
+ P = int((n_position // T) ** 0.5)
183
+ C = d_hid
184
+ sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
185
+ sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T) # BHW, C, T
186
+ sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
187
+ sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
188
+ sinusoid_table = sinusoid_table.flatten(1, 3)
189
+ return sinusoid_table
190
+ else:
191
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
192
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
193
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
194
+ return torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
195
+
196
+
197
+ def get_sinusoid_encoding_table2(n_position=784, d_hid=1024, cur_frame=8, ckpt_num_frame=4, pre_n_position=784):
198
+ ''' Sinusoid position encoding table '''
199
+ # TODO: make it with torch instead of numpy
200
+ def get_position_angle_vec(position):
201
+ return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
202
+
203
+ # generate checkpoint position embedding
204
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
205
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
206
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
207
+ sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
208
+
209
+ print(f"n_position: {n_position}")
210
+ print(f"pre_n_position: {pre_n_position}")
211
+
212
+ if n_position != pre_n_position:
213
+ T = ckpt_num_frame # checkpoint frame
214
+ P = 14 # checkpoint size
215
+ C = d_hid
216
+ new_P = int((n_position // cur_frame) ** 0.5) # testing size
217
+ print(f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
218
+ print(f'Interpolate the position embedding')
219
+ sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
220
+ sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
221
+ sinusoid_table = torch.nn.functional.interpolate(
222
+ sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
223
+ # BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
224
+ sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
225
+ sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
226
+
227
+ if cur_frame != ckpt_num_frame:
228
+ print(f'Pretraining uses 4 frames, but current frame is {cur_frame}')
229
+ print(f'Interpolate the position embedding')
230
+ T = ckpt_num_frame # checkpoint frame
231
+ new_T = cur_frame # testing frame
232
+ # interpolate
233
+ P = int((n_position // cur_frame) ** 0.5) # testing size
234
+ C = d_hid
235
+ sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
236
+ sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T) # BHW, C, T
237
+ sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
238
+ sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
239
+ sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
240
+
241
+ return sinusoid_table
242
+
243
+
244
+ class PretrainVisionTransformerEncoder(nn.Module):
245
+ """ Vision Transformer with support for patch or hybrid CNN input stage
246
+ """
247
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12,
248
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
249
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, num_frames=8, tubelet_size=1,
250
+ use_learnable_pos_emb=False,
251
+ use_checkpoint=False, checkpoint_num=0,
252
+ ckpt_num_frame=-1, with_ln=True, return_index=-1
253
+ ):
254
+ super().__init__()
255
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
256
+ self.patch_embed = PatchEmbed(
257
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
258
+ num_frames=num_frames, tubelet_size=tubelet_size
259
+ )
260
+ num_patches = self.patch_embed.num_patches
261
+ self.depth = depth + return_index + 1
262
+ self.use_checkpoint = use_checkpoint
263
+ self.checkpoint_num = checkpoint_num
264
+ logger.info(f"Use checkpoint: {use_checkpoint}")
265
+ logger.info(f"Checkpoint number: {checkpoint_num}")
266
+ logger.info(f"Real running depth: {self.depth}")
267
+
268
+ # TODO: Add the cls token
269
+ if use_learnable_pos_emb:
270
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
271
+ self.img_pos_embed = nn.Parameter(torch.zeros(1, num_patches//(num_frames//tubelet_size) + 1, embed_dim))
272
+ else:
273
+ # sine-cosine positional embeddings
274
+ if img_size != 224:
275
+ self.pos_embed = get_sinusoid_encoding_table2(num_patches, embed_dim, ckpt_num_frame=ckpt_num_frame, cur_frame=num_frames//tubelet_size)
276
+ self.img_pos_embed = get_sinusoid_encoding_table2(num_patches//(num_frames//tubelet_size), embed_dim, cur_frame=1, ckpt_num_frame=1, pre_n_position=14*14)
277
+ else:
278
+ self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim, ckpt_num_frame=ckpt_num_frame, cur_frame=num_frames//tubelet_size)
279
+ self.img_pos_embed = get_sinusoid_encoding_table(num_patches//(num_frames//tubelet_size), embed_dim)
280
+
281
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
282
+ self.blocks = nn.ModuleList([
283
+ Block(
284
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
285
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
286
+ init_values=init_values)
287
+ for i in range(self.depth)])
288
+
289
+ if with_ln:
290
+ self.norm = norm_layer(embed_dim)
291
+ else:
292
+ self.norm = nn.Identity()
293
+
294
+ if use_learnable_pos_emb:
295
+ trunc_normal_(self.pos_embed, std=.02)
296
+
297
+ @torch.jit.ignore
298
+ def no_weight_decay(self):
299
+ return {'pos_embed', 'cls_token'}
300
+
301
+ def forward_features(self, x, use_image=False):
302
+ x = self.patch_embed(x)
303
+
304
+ if use_image:
305
+ x = x + self.img_pos_embed.type_as(x).to(x.device).clone().detach()
306
+ else:
307
+ x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()
308
+
309
+ B, _, C = x.shape
310
+ x_vis = x
311
+
312
+ for idx, blk in enumerate(self.blocks):
313
+ if self.use_checkpoint and idx < self.checkpoint_num:
314
+ x_vis = checkpoint.checkpoint(blk, x_vis)
315
+ else:
316
+ x_vis = blk(x_vis)
317
+
318
+ # apply the final LayerNorm (Identity when with_ln=False)
319
+ x_vis = self.norm(x_vis)
320
+ return x_vis
321
+
322
+ def forward(self, x, use_image=False):
323
+ x_vis = self.forward_features(x, use_image)
324
+ return x_vis
325
+
326
+
327
+ class PretrainVisionTransformer(nn.Module):
328
+ """ Vision Transformer with support for patch or hybrid CNN input stage
329
+ """
330
+ def __init__(self,
331
+ img_size=224,
332
+ patch_size=16,
333
+ encoder_in_chans=3,
334
+ encoder_embed_dim=768,
335
+ encoder_depth=12,
336
+ encoder_num_heads=12,
337
+ mlp_ratio=4.,
338
+ qkv_bias=True,
339
+ qk_scale=None,
340
+ drop_rate=0.,
341
+ attn_drop_rate=0.,
342
+ drop_path_rate=0.,
343
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
344
+ init_values=0.,
345
+ use_learnable_pos_emb=False,
346
+ num_frames=8,
347
+ tubelet_size=1,
348
+ use_checkpoint=False,
349
+ checkpoint_num=0,
350
+ ckpt_num_frame=4, # the pretrained model uses 4 frames
351
+ return_index=-1,
352
+ with_ln=False
353
+ ):
354
+ super().__init__()
355
+
356
+ self.encoder = PretrainVisionTransformerEncoder(
357
+ img_size=img_size,
358
+ patch_size=patch_size,
359
+ in_chans=encoder_in_chans,
360
+ embed_dim=encoder_embed_dim,
361
+ depth=encoder_depth,
362
+ num_heads=encoder_num_heads,
363
+ mlp_ratio=mlp_ratio,
364
+ qkv_bias=qkv_bias,
365
+ qk_scale=qk_scale,
366
+ drop_rate=drop_rate,
367
+ attn_drop_rate=attn_drop_rate,
368
+ drop_path_rate=drop_path_rate,
369
+ norm_layer=norm_layer,
370
+ init_values=init_values,
371
+ num_frames=num_frames,
372
+ tubelet_size=tubelet_size,
373
+ use_learnable_pos_emb=use_learnable_pos_emb,
374
+ use_checkpoint=use_checkpoint,
375
+ checkpoint_num=checkpoint_num,
376
+ ckpt_num_frame=ckpt_num_frame,
377
+ with_ln=with_ln,
378
+ return_index=return_index
379
+ )
380
+ logger.info(f'With LN: {with_ln}')
381
+ logger.info(f'Total {encoder_depth} layers')
382
+ logger.info(f'Return {encoder_depth+return_index+1}-th layer')
383
+
384
+ self.apply(self._init_weights)
385
+
386
+ def _init_weights(self, m):
387
+ if isinstance(m, nn.Linear):
388
+ nn.init.xavier_uniform_(m.weight)
389
+ if isinstance(m, nn.Linear) and m.bias is not None:
390
+ nn.init.constant_(m.bias, 0)
391
+ elif isinstance(m, nn.LayerNorm):
392
+ nn.init.constant_(m.bias, 0)
393
+ nn.init.constant_(m.weight, 1.0)
394
+
395
+ @torch.jit.ignore
396
+ def no_weight_decay(self):
397
+ return {'pos_embed', 'cls_token', 'clip_pos_embed'}
398
+
399
+ def forward(self, x, use_image=False):
400
+ T = x.shape[2]
401
+ x_vis = self.encoder(x, use_image) # [B, N_vis, C_e]
402
+ B, TL, C = x_vis.shape
403
+ x_vis = x_vis.view(B, T, TL // T, C)
404
+
405
+ return x_vis
406
+
407
+
408
+ def build_vit(config):
409
+ model = PretrainVisionTransformer(
410
+ img_size=config.vision_encoder.img_size,
411
+ patch_size=config.vision_encoder.patch_size,
412
+ encoder_embed_dim=config.vision_encoder.encoder_embed_dim,
413
+ encoder_depth=config.vision_encoder.encoder_depth,
414
+ encoder_num_heads=config.vision_encoder.encoder_num_heads,
415
+ drop_path_rate=config.vision_encoder.drop_path_rate,
416
+ num_frames=config.vision_encoder.num_frames,
417
+ tubelet_size=config.vision_encoder.tubelet_size,
418
+ use_checkpoint=config.vision_encoder.use_checkpoint,
419
+ checkpoint_num=config.vision_encoder.checkpoint_num,
420
+ return_index=config.vision_encoder.get('return_index', -1),
421
+ with_ln=config.vision_encoder.get('with_ln', False),
422
+ )
423
+ model.default_cfg = _cfg()
424
+ if config.vision_encoder.pretrained:
425
+ logger.info(f"Loading pretrained weights from {config.vision_encoder.pretrained}")
426
+ state_dict = torch.load(config.vision_encoder.pretrained, map_location='cpu')
427
+ model.load_state_dict(state_dict, strict=False)
428
+ else:
429
+ logger.info("No pretrained weights!!!")
430
+ return model
431
+
432
+
433
+ if __name__ == '__main__':
434
+ import time
435
+ from fvcore.nn import FlopCountAnalysis
436
+ from fvcore.nn import flop_count_table
437
+ import numpy as np
438
+
439
+ seed = 4217
440
+ np.random.seed(seed)
441
+ torch.manual_seed(seed)
442
+ torch.cuda.manual_seed(seed)
443
+ torch.cuda.manual_seed_all(seed)
444
+ num_frames = 4
445
+
446
+ config = {
447
+ 'vision_encoder':
448
+ {
449
+ 'img_size': 224,
450
+ 'patch_size': 16,
451
+ 'encoder_embed_dim': 768,
452
+ 'encoder_depth': 12,
453
+ 'encoder_num_heads': 12,
454
+ 'drop_path_rate': 0.1,
455
+ 'num_frames': num_frames,
456
+ 'tubelet_size': 1,
457
+ 'use_checkpoint': False,
458
+ 'checkpoint_num': 0,
459
+ 'pretrained': 'your_model_path/l16_25m.pth',
460
+ 'ckpt_num_frame': 8,
461
+ 'return_index': -1,
462
+ 'with_ln': False,
463
+ }
464
+ }
465
+ from easydict import EasyDict
466
+ model = build_vit(EasyDict(config))
467
+
468
+ # flops = FlopCountAnalysis(model, torch.rand(1, 3, num_frames, 224, 224))
469
+ # s = time.time()
470
+ # print(flop_count_table(flops, max_depth=1))
471
+ # print(time.time()-s)
472
+ print(model(torch.rand(1, 3, num_frames, 224, 224), False).shape)
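For reference, a quick sanity check on the expected output shape, assuming the script above runs (i.e. the pretrained path resolves): with 224x224 inputs, 16x16 patches, tubelet_size=1 and num_frames=4, each frame yields 14*14 = 196 tokens, and the encoder output is reshaped to [B, T, L, C]:

out = model(torch.rand(1, 3, num_frames, 224, 224), False)
assert out.shape == (1, 4, 14 * 14, 768)  # torch.Size([1, 4, 196, 768])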