guoxy25 committed on Jan 23, 2025

Commit

c6dee39

verified ·

1 Parent(s): 6d3b478

Upload 56 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

added_tokens.json +56 -0
audio_modeling_baichuan.py +194 -0
config.json +407 -0
configuration_baichuan.py +111 -0
generation_config.json +6 -0
generation_utils.py +83 -0
latest +1 -0
merges.txt +0 -0
modeling_baichuan.py +1001 -0
moe.py +69 -0
processor_baichuan.py +1154 -0
pytorch_model-00001-of-00002.bin +3 -0
pytorch_model-00002-of-00002.bin +3 -0
pytorch_model.bin.index.json +833 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_10.pth +3 -0
rng_state_11.pth +3 -0
rng_state_12.pth +3 -0
rng_state_13.pth +3 -0
rng_state_14.pth +3 -0
rng_state_15.pth +3 -0
rng_state_16.pth +3 -0
rng_state_17.pth +3 -0
rng_state_18.pth +3 -0
rng_state_19.pth +3 -0
rng_state_2.pth +3 -0
rng_state_20.pth +3 -0
rng_state_21.pth +3 -0
rng_state_22.pth +3 -0
rng_state_23.pth +3 -0
rng_state_24.pth +3 -0
rng_state_25.pth +3 -0
rng_state_26.pth +3 -0
rng_state_27.pth +3 -0
rng_state_28.pth +3 -0
rng_state_29.pth +3 -0
rng_state_3.pth +3 -0
rng_state_30.pth +3 -0
rng_state_31.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
rng_state_8.pth +3 -0
rng_state_9.pth +3 -0
scheduler.pt +3 -0
sequence_parallel_utils.py +186 -0
special_tokens_map.json +63 -0
tokenizer.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "</tool_call>": 151658,
+  "<B_APE>": 151671,
+  "<B_CODE>": 151670,
+  "<B_FUNC>": 151669,
+  "<B_SYS>": 151665,
+  "<B_USYS>": 151666,
+  "<C_A>": 151668,
+  "<C_Q>": 151667,
+  "<audio_delim_baichuan>": 151693,
+  "<audio_end_baichuan>": 151677,
+  "<audio_pad_baichuan>": 151678,
+  "<audio_start_baichuan>": 151676,
+  "<baichuan_pad_token>": 151691,
+  "<box_delim_baichuan>": 151685,
+  "<box_end_baichuan>": 151684,
+  "<box_start_baichuan>": 151683,
+  "<calc_end>": 151674,
+  "<calc_start>": 151673,
+  "<function_calling>": 151672,
+  "<img_delim_baichuan>": 151688,
+  "<img_end_baichuan>": 151680,
+  "<img_newline_baichuan>": 151682,
+  "<img_pad_baichuan>": 151681,
+  "<img_start_baichuan>": 151679,
+  "<inner_think>": 151675,
+  "<polygon_end_baichuan>": 151690,
+  "<polygon_start_baichuan>": 151689,
+  "<ref_end_baichuan>": 151687,
+  "<ref_start_baichuan>": 151686,
+  "<reserved_113>": 151692,
+  "<tool_call>": 151657,
+  "<video_end_baichuan>": 151696,
+  "<video_palce_baichuan>": 151694,
+  "<video_start_baichuan>": 151695,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

audio_modeling_baichuan.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import torch, random, fire
+from transformers.models.whisper import WhisperConfig
+from torch.nn import functional as F
+from flash_attn import flash_attn_varlen_func
+from torch import nn
+import numpy as np
+from transformers.activations import ACT2FN
+import math
+def sinusoids(length, channels, max_timescale=10000):
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+class BaichuanWhisperAttention(nn.Module):
+    """Multi-head self-attention over packed, variable-length sequences.
+
+    Attention is computed with flash_attn_varlen_func (causal=False). The key
+    projection has no bias; the query, value and output projections do.
+    """
+    def __init__(self, embed_dim, num_heads):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+    def forward(self, hidden_states: torch.Tensor, seq_len: torch.Tensor):
+        """Run self-attention on a 2-D packed input.
+
+        hidden_states: 2-D tensor (rows, embed_dim).
+        seq_len: per-sequence lengths, used to build the cumulative offsets
+            handed to the varlen kernel; presumably sum(seq_len) equals the
+            number of rows -- TODO confirm against callers.
+        Returns a tensor of shape (rows, embed_dim).
+        """
+        # NOTE: despite its name, `bsz` is the size of the first dimension of
+        # the 2-D input (the row count), not a number of batch samples.
+        bsz, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
+        key_states = self.k_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
+        value_states = self.v_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
+        # Cumulative lengths with a leading 0: [0, l0, l0 + l1, ...] as int32.
+        cu_len = F.pad(torch.cumsum(seq_len, dim=0), (1, 0), "constant", 0).to(torch.int32)
+        max_seqlen = torch.max(seq_len).to(torch.int32).detach()
+        attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_len, cu_len, max_seqlen, max_seqlen, causal=False)  # (bsz * qlen, nheads, headdim)
+        # Merge the head dimensions back into embed_dim before the output projection.
+        attn_output = attn_output.reshape(bsz, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output
+class BaichuanWhisperEncoderLayer(nn.Module):
+    """One encoder layer: self-attention and a two-layer feed-forward block.
+
+    Each sub-block applies LayerNorm to its input first and adds the result
+    back to the un-normalized input (residual connection).
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = BaichuanWhisperAttention(self.embed_dim, config.encoder_attention_heads)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+    def forward(self, hidden_states: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor:
+        """Apply the layer to hidden_states; seq_len is forwarded to self-attention."""
+        # Attention sub-block: norm -> attention -> residual add.
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(hidden_states, seq_len)
+        hidden_states = residual + hidden_states
+        # Feed-forward sub-block: norm -> fc1 -> activation -> fc2 -> residual add.
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        # For half-precision dtypes only: when inf or nan values are present,
+        # clamp to slightly inside the dtype's finite range.
+        if (hidden_states.dtype == torch.float16 or hidden_states.dtype == torch.bfloat16) and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+        return hidden_states
+class BaichuanAudioEncoder(nn.Module):
+    """Audio encoder: two 1-D convolutions, sinusoidal positions and a stack of
+    BaichuanWhisperEncoderLayer modules run on packed (padding-free) hidden states.
+    """
+    def __init__(self, config):
+        super().__init__()
+        # NOTE: this mutates the config object passed in by the caller.
+        config._attn_implementation = 'flash_attention_2'  #
+        self.config = config
+        # Number of positions after the strided conv: frames in max_audio_seconds
+        # (sampling_rate // hop_length per second) divided by stride_size.
+        self.max_source_positions = (config.max_audio_seconds * config.sampling_rate // config.hop_length) // config.stride_size
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        # Needs to be registered in the LLM's initialization.
+        self.conv1 = nn.Conv1d(config.num_mel_bins, config.d_model, kernel_size=config.kernel_size, padding=1)
+        self.conv2 = nn.Conv1d(config.d_model, config.d_model, kernel_size=config.kernel_size, stride=config.stride_size, padding=1)
+        self.register_buffer("positional_embedding", sinusoids(self.max_source_positions, config.d_model))  # 1500 * d
+        self.layers = nn.ModuleList([BaichuanWhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+        self.gradient_checkpointing = True
+    @torch.no_grad()
+    def fake_input(self, device):
+        """Return dummy inputs: random features of shape [2, num_mel_bins, 10],
+        encoder lengths of 3 and bridge lengths of 1 for both samples."""
+        input_features = torch.rand([2, self.config.num_mel_bins, 10], dtype=torch.float32, device=device)
+        encoder_length = torch.ones([2], dtype=torch.int32, device=device) * 3
+        bridge_length = torch.ones([2], dtype=torch.int32, device=device)
+        return input_features, encoder_length, bridge_length
+    def forward(
+        self,
+        input_features,
+        output_length,  # MAKE SURE: this must be the hidden-state length after the two conv layers
+    ):
+        """Encode a padded batch of audio features.
+
+        Valid positions are packed into a 2-D tensor before the encoder layers
+        and scattered back afterwards. Returns a (bsz, tgt_len, d_model) tensor
+        whose positions at or beyond each sample's output_length are zero.
+        """
+        input_features = input_features.to(self.conv1.weight.dtype)
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))  # (bs, channels, frames)
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))  # (bs, channels, frames // 2)
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)  # (bs, frams, channels)
+        bsz, tgt_len, _ = inputs_embeds.size()  # max length in the current batch
+        if tgt_len < self.positional_embedding.shape[0]:
+            current_positional_embedding = self.positional_embedding[:tgt_len]
+        else:
+            current_positional_embedding = self.positional_embedding
+        # The addition is done in float32, then cast back to the embedding dtype.
+        hidden_states = (inputs_embeds.to(torch.float32) + current_positional_embedding).to(inputs_embeds.dtype)
+        # packing hidden states
+        # Boolean mask (bsz, tgt_len, 1): True where position < output_length of that sample.
+        attention_mask = torch.arange(0, tgt_len).to(hidden_states.device)
+        attention_mask = torch.lt(attention_mask, output_length.reshape(bsz, 1)).view(bsz, tgt_len, 1)
+        # For every slot of the flattened padded grid: index of the most recent
+        # valid packed row (padding slots reuse it and are zeroed after unpacking).
+        # NOTE(review): this is -1 if the first sample has length 0 -- confirm
+        # callers never pass that.
+        unpacking_index = torch.cumsum(attention_mask.to(torch.int32).view(-1), dim=0) - 1  # convert to indices
+        hidden_states = torch.masked_select(hidden_states, attention_mask).view(torch.sum(output_length), self.config.d_model)
+        for idx, encoder_layer in enumerate(self.layers):
+            # Activation checkpointing is used only in training mode.
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(encoder_layer),
+                    hidden_states,
+                    output_length
+                )
+            else:
+                hidden_states = encoder_layer(hidden_states, output_length)
+        hidden_states = self.layer_norm(hidden_states)
+        # unpacking
+        hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(bsz, tgt_len, self.config.d_model)
+        hidden_states = torch.where(attention_mask, hidden_states, 0)
+        return hidden_states
+class BaichuanAudioBridge(nn.Module):
+    """Projects audio encoder outputs (d_model) to the language model's hidden_size,
+    with optional average pooling along the time axis first."""
+    def __init__(self, config):
+        super().__init__()
+        # Only the audio sub-config is kept; hidden_size is read from the top-level config.
+        self.config = config.audio_config
+        if self.config.avg_pooler > 1:
+            # NOTE(review): kernel size comes from avg_pooler but stride is fixed
+            # at 2 -- confirm this is intended when avg_pooler != 2.
+            self.avg_pooler = nn.AvgPool1d(self.config.avg_pooler, stride=2)
+        else:
+            self.avg_pooler = None
+        self.proj1 = nn.Linear(self.config.d_model, config.hidden_size)
+        self.proj2 = nn.Linear(config.hidden_size, config.hidden_size)
+    def forward(self, x, output_length):
+        """Pool, keep the first output_length positions of each sample, and project.
+
+        x: 3-D tensor (batch, seq, d_model).
+        output_length: per-sample number of valid positions after pooling.
+        Returns a packed 2-D tensor (sum of valid lengths, hidden_size).
+        """
+        if self.avg_pooler is not None:
+            # AvgPool1d pools over the last axis, so move time there and back.
+            x = x.permute(0, 2, 1)
+            x = self.avg_pooler(x)
+            x = x.permute(0, 2, 1)
+        batch_size, sl, _ = x.shape
+        output_length = output_length.to(x.device)
+        valid_mask = torch.arange(0, sl).to(x.device)
+        valid_mask = torch.lt(valid_mask, output_length.reshape(batch_size, 1)).reshape(batch_size, sl, 1)
+        x = torch.masked_select(x, valid_mask).reshape(-1, self.config.d_model) # (sum(valid_sequence_length), d)
+        x = ACT2FN[self.config.activation_function](self.proj1(x))
+        x = self.proj2(x)
+        return x
+def test_audio():
+    """Manual smoke test: run a shrunken audio encoder and bridge on random input.
+
+    Loads the config from the current directory, requires a CUDA device, and
+    prints the lengths, output shapes and output tensors.
+    """
+    from transformers import AutoConfig
+    from processor_baichuan import BaichuanAudioProcessor
+    # from ..configuration_baichuan import BaichuanConfig
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    # Shrink the encoder so the test is cheap to run.
+    config.audio_config.d_model = 24
+    config.audio_config.encoder_layers = 2
+    config.audio_config.encoder_attention_heads = 4
+    config.audio_config.encoder_ffn_dim = 48
+    ae = BaichuanAudioEncoder(config.audio_config).cuda().to(torch.bfloat16)
+    bg = BaichuanAudioBridge(config).cuda().to(torch.bfloat16)
+    # l is the padded number of input frames; each sample gets a random valid length <= l.
+    l = random.randint(10, 30)
+    bs = 3
+    input_length = torch.tensor([random.randint(1, l) for _ in range(bs)])
+    encoder_length, bridge_length = BaichuanAudioProcessor.inference_output_length(config.audio_config, input_length)
+    print("l={}, input_valid_length={},\nencoder_valid_length={}, bridge_valid_length={}".format(l, input_length, encoder_length, bridge_length))
+    wave_features = torch.rand((bs, config.audio_config.num_mel_bins, l))
+    a = ae(wave_features.to('cuda'), encoder_length.to('cuda'))
+    b = bg(a, bridge_length.to('cuda'))
+    print('encoder output={}, bridge output={}'.format(a.shape, b.shape))
+    print(a)
+    print(b)
+# Script entry point: hand control to python-fire, which builds a command-line
+# interface from this module (e.g. to invoke test_audio).
+if __name__ == '__main__':
+    fire.Fire()

config.json ADDED Viewed

	@@ -0,0 +1,407 @@

+{
+  "_name_or_path": "/cpfs/29f69eb5e2e60f26/code/mllm/zhangtao02/workspace/3_bc_mllm/models/mm_pretrain/cs_ocr3b/cs_ocr3b_ift_1204_ckpt_epoch_1_12030256",
+  "architectures": [
+    "BaichuanForCausalLM"
+  ],
+  "attention_qkv_bias": true,
+  "attention_qkv_pack": false,
+  "audio_config": {
+    "_name_or_path": "",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": false,
+    "apply_spec_augment": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "audio_delim_token_id": 151693,
+    "audio_end_token_id": 151677,
+    "audio_pad_token_id": 151678,
+    "audio_start_token_id": 151676,
+    "avg_pooler": 2,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": [
+      220,
+      50256
+    ],
+    "bos_token_id": 50256,
+    "chunk_size_feed_forward": 0,
+    "classifier_proj_size": 256,
+    "cross_attention_hidden_size": null,
+    "d_model": 1280,
+    "decoder_attention_heads": 6,
+    "decoder_ffn_dim": 1536,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 4,
+    "decoder_start_token_id": 50257,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "enable": false,
+    "encoder_attention_heads": 20,
+    "encoder_ffn_dim": 5120,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 32,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 50256,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hop_length": 160,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": true,
+    "kernel_size": 3,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_prob": 0.05,
+    "max_audio_seconds": 30,
+    "max_length": 20,
+    "max_source_positions": 1500,
+    "max_target_positions": 448,
+    "median_filter_width": 7,
+    "min_length": 0,
+    "model_type": "whisper",
+    "n_fft": 400,
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 32,
+    "num_mel_bins": 128,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 50256,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sampling_rate": 16000,
+    "scale_embedding": false,
+    "scaling_embedding": false,
+    "sep_token_id": null,
+    "split_overlap": 0.1,
+    "stride_size": 2,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "use_weighted_layer_sum": false,
+    "vocab_size": 51865
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_baichuan.BaichuanConfig",
+    "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM"
+  },
+  "baichuan_tokenizer_type": "auto",
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 8192,
+  "max_window_layers": 36,
+  "model_type": "baichuan",
+  "moe": false,
+  "multimodal": [
+    "image"
+  ],
+  "multimodal_special_token_list": [
+    151676,
+    151677,
+    151678,
+    151679,
+    151680,
+    151681,
+    151682,
+    151683,
+    151684,
+    151685,
+    151686,
+    151687,
+    151688,
+    151693,
+    151694,
+    151695,
+    151696
+  ],
+  "multimodal_special_token_no_loss_list": [
+    151676,
+    151677,
+    151678,
+    151679,
+    151680,
+    151681,
+    151682,
+    151683,
+    151684,
+    151685,
+    151686,
+    151687,
+    151688,
+    151693,
+    151694,
+    151695,
+    151696
+  ],
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 0,
+  "position_embedding_type": "rope",
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "sparse_attention_heads": null,
+  "sparse_attention_layers": [],
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "train_multimodal_special_tokens_only": false,
+  "transformers_version": "4.45.0.dev0",
+  "use_cache": false,
+  "use_norm_head": false,
+  "use_sliding_window": false,
+  "video_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decode_way": "1fps",
+    "decoder_start_token_id": null,
+    "depth": 32,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "embed_dim": 1280,
+    "enable": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 2048,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_delimiter_token_id": 151688,
+    "image_end_token_id": 151680,
+    "image_line_token_id": 151682,
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_pad_token_id": 151681,
+    "image_size": 224,
+    "image_start_token_id": 151679,
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "in_channels": 3,
+    "in_chans": 3,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_frame_num": 128,
+    "max_length": 20,
+    "max_pixels": 784000,
+    "merge_size": 4,
+    "min_length": 0,
+    "min_pixels": 3136,
+    "mlp_ratio": 4,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_heads": 16,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "temporal_patch_size": 2,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "video_end_token_id": 151696,
+    "video_place_token_id": 151694,
+    "video_start_token_id": 151695
+  },
+  "visual_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "depth": 32,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "embed_dim": 1280,
+    "enable": true,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 2048,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_delimiter_token_id": 151688,
+    "image_end_token_id": 151680,
+    "image_line_token_id": 151682,
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_pad_token_id": 151681,
+    "image_size": 224,
+    "image_start_token_id": 151679,
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "in_channels": 3,
+    "in_chans": 3,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_pixels": 784000,
+    "merge_size": 2,
+    "min_length": 0,
+    "min_pixels": 3136,
+    "mlp_ratio": 4,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_heads": 16,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "temporal_patch_size": 2,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "vocab_size": 151936
+}

configuration_baichuan.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# Copyright 2023 Baichuan Inc. All Rights Reserved.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers import WhisperConfig
+from transformers import CLIPVisionConfig
+logger = logging.get_logger(__name__)
+class BaichuanConfig(PretrainedConfig):
+    model_type = "baichuan"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=125696,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        sparse_attention_heads=None,
+        sparse_attention_layers=[],
+        head_dim=None,
+        attention_qkv_pack=True,
+        attention_qkv_bias=False,
+        use_norm_head=True,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        position_embedding_type="rope",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        audio_config=None,
+        visual_config=None,
+        video_config=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads or self.num_attention_heads
+        self.sparse_attention_heads = sparse_attention_heads
+        self.sparse_attention_layers = sparse_attention_layers
+        self.head_dim = head_dim or self.hidden_size // self.num_attention_heads
+        self.attention_qkv_pack = attention_qkv_pack
+        self.attention_qkv_bias = attention_qkv_bias
+        self.use_norm_head = use_norm_head
+        self.hidden_act = hidden_act
+        self.position_embedding_type = position_embedding_type
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        assert self.position_embedding_type.lower() in ("rope", "alibi")
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        if audio_config is not None:
+            self.audio_config = WhisperConfig(**audio_config)
+        if visual_config is not None:
+            self.visual_config = CLIPVisionConfig(**visual_config)
+        if video_config is not None:
+            self.video_config = CLIPVisionConfig(**video_config)
+    def to_diff_dict(self):
+        data = super().to_diff_dict()
+        data["model_type"] = self.model_type
+        return data
+    def get_rotary_base(self):
+        if hasattr(self, "rotary_emb_base"):
+            return self.rotary_emb_base
+        else:
+            return self.rope_theta
+# Manual check: load the config from the current directory (remote code
+# enabled) and print it.
+if __name__ == '__main__':
+    from transformers import AutoConfig
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    print(config)

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.45.0.dev0"
+}

generation_utils.py ADDED Viewed

	@@ -0,0 +1,83 @@

+from typing import List
+from queue import Queue
+import torch
+def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0):
+    def _parse_messages(messages, split_role="user"):
+        system, rounds = "", []
+        round = []
+        for i, message in enumerate(messages):
+            if message["role"] == "system":
+                assert i == 0
+                system = message["content"]
+                continue
+            if message["role"] == split_role and round:
+                rounds.append(round)
+                round = []
+            round.append(message)
+        if round:
+            rounds.append(round)
+        return system, rounds
+    max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens
+    max_input_tokens = model.config.model_max_length - max_new_tokens
+    system, rounds = _parse_messages(messages, split_role="user")
+    system_tokens = tokenizer.encode(system)
+    max_history_tokens = max_input_tokens - len(system_tokens)
+    history_tokens = []
+    for round in rounds[::-1]:
+        round_tokens = []
+        for message in round:
+            if message["role"] == "user":
+                round_tokens.append(model.generation_config.user_token_id)
+            else:
+                round_tokens.append(model.generation_config.assistant_token_id)
+            round_tokens.extend(tokenizer.encode(message["content"]))
+        if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens:
+            history_tokens = round_tokens + history_tokens  # concat left
+            if len(history_tokens) < max_history_tokens:
+                continue
+        break
+    input_tokens = system_tokens + history_tokens
+    if messages[-1]["role"] != "assistant":
+        input_tokens.append(model.generation_config.assistant_token_id)
+    input_tokens = input_tokens[-max_input_tokens:]  # truncate left
+    return torch.LongTensor([input_tokens]).to(model.device)
+class TextIterStreamer:
+    def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False):
+        self.tokenizer = tokenizer
+        self.skip_prompt = skip_prompt
+        self.skip_special_tokens = skip_special_tokens
+        self.tokens = []
+        self.text_queue = Queue()
+        self.next_tokens_are_prompt = True
+    def put(self, value):
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+        else:
+            if len(value.shape) > 1:
+                value = value[0]
+            self.tokens.extend(value.tolist())
+            self.text_queue.put(
+                self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens))
+    def end(self):
+        self.text_queue.put(None)
+    def __iter__(self):
+        return self
+    def __next__(self):
+        value = self.text_queue.get()
+        if value is None:
+            raise StopIteration()
+        else:
+            return value

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step8877

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_baichuan.py ADDED Viewed

	@@ -0,0 +1,1001 @@

+# Copyright 2023 Baichuan Inc. All Rights Reserved.
+#
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Baichuan model."""
+import os
+import json
+import math
+from typing import List, Optional, Tuple, Union
+from threading import Thread
+from easydict import EasyDict
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from torch.nn import functional as F
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from dataclasses import dataclass
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
+from transformers.generation.utils import GenerationConfig
+from transformers.utils import logging
+from .configuration_baichuan import BaichuanConfig
+from .audio_modeling_baichuan import BaichuanAudioEncoder, BaichuanAudioBridge
+from .visual_modeling_baichuan import BaichuanVisualEncoder, BaichuanVisualBridge
+from .processor_baichuan import BaichuanMMProcessor
+from .moe import moe_matmul
+# Support model paths that contain a dot (.)
+try:
+    # step1: copy relative imports to transformers_modules
+    from .generation_utils import build_chat_input, TextIterStreamer
+    from .sequence_parallel_utils import (
+        create_attention_layer,
+        get_sequence_parallel_size,
+        get_sequence_parallel_chunk,
+    )
+except ModuleNotFoundError:
+    # step2: direct import from transformers_modules
+    try:  # bypass check_imports failure
+        import sys
+        sys.path.append(os.path.dirname(__file__))
+        from generation_utils import build_chat_input, TextIterStreamer
+        from sequence_parallel_utils import (
+            create_attention_layer,
+            get_sequence_parallel_size,
+            get_sequence_parallel_chunk,
+        )
+    except Exception:
+        raise
+logger = logging.get_logger(__name__)
+def get_slopes(n):
+    def get_slopes_power_of_2(n):
+        start = (2 ** (-2 ** -(math.log2(n) - 3)))
+        ratio = start
+        return [start * ratio ** i for i in range(n)]
+    if math.log2(n).is_integer():
+        return get_slopes_power_of_2(
+            n)  # In the paper, we only train models that have 2^a heads for some a. This function has
+    else:  # some good properties that only occur when the input is a power of 2. To maintain that even
+        closest_power_of_2 = 2 ** math.floor(
+            math.log2(n))  # when the number of heads is not a power of 2, we use this workaround.
+        return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
+                                                           :n - closest_power_of_2]
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+        return self.weight * hidden_states
+class RotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=5e6, device=None):
+        super().__init__()
+        # 修复RePE初始化精度问题 https://zhuanlan.zhihu.com/p/678963442
+        # DeepSpeed 会 Hack torch.arange 强制在 GPU 上运行，这里使用原生的 torch.arange
+        try:
+            import deepspeed
+            self.arange = deepspeed.runtime.zero.partition_parameters._orig_torch_arange
+        except:
+            self.arange = torch.arange
+        self.inv_freq = 1.0 / (base ** (self.arange(0, dim, 2).float().to(device) / dim))
+        self.max_seq_len_cached = max_position_embeddings
+        t = self.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
+        freqs = torch.outer(t, self.inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32)
+        self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32)
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+            t = self.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
+            freqs = torch.outer(t, self.inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device)
+            self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(torch.float32).to(x.device),
+            self.sin_cached[:, :, :seq_len, ...].to(torch.float32).to(x.device),
+        )
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids):
+    cos = cos_.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin_.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin)
+    k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin)
+    return q_embed.to(q.dtype), k_embed.to(k.dtype)
+class MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper.
+    Supports packed (``W_pack``) or separate q/k/v projections, fewer key/value
+    heads than query heads (k/v heads are repeated before attention), 'rope' or
+    'alibi' position embeddings, and an optional second set of projection
+    weights (``moe_W_pack`` / ``moe_o_proj``) selected per token via ``group_index``.
+    The attention kernel itself comes from ``create_attention_layer``.
+    """
+    def __init__(self, config: BaichuanConfig, is_sparse=False):
+        super().__init__()
+        self.config = config
+        self.position_embedding_type = config.position_embedding_type.lower()
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim = config.head_dim
+        # Width of the query projection / of each of the key and value projections.
+        self.hidden_size = config.num_attention_heads * self.head_dim
+        self.hidden_kv_size = self.num_kv_heads * self.head_dim
+        if is_sparse:
+            self.num_heads = config.sparse_attention_heads
+            assert self.num_kv_heads == config.num_attention_heads
+            # NOTE(review): W_pack here outputs 3 * sparse_attention_heads * head_dim, while forward() splits
+            # into [hidden_size, hidden_kv_size, hidden_kv_size]; these only agree when
+            # sparse_attention_heads == num_attention_heads -- confirm.
+            # NOTE(review): this branch always creates W_pack, but forward() uses q_proj/k_proj/v_proj when
+            # config.attention_qkv_pack is false -- confirm sparse layers are only used with packed qkv.
+            self.W_pack = nn.Linear(self.hidden_size, 3 * self.num_heads * self.head_dim, bias=config.attention_qkv_bias)
+            self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        else:
+            self.num_heads = config.num_attention_heads
+            if self.config.attention_qkv_pack:
+                self.W_pack = nn.Linear(config.hidden_size, self.hidden_size + self.hidden_kv_size * 2, bias=config.attention_qkv_bias)
+                if config.moe:
+                    # Second packed projection used by the MoE path; it never has a bias.
+                    self.moe_W_pack = nn.Linear(config.hidden_size, self.hidden_size + self.hidden_kv_size * 2, bias=False)
+            else:
+                self.q_proj = nn.Linear(config.hidden_size, self.hidden_size, bias=config.attention_qkv_bias)
+                self.k_proj = nn.Linear(config.hidden_size, self.hidden_kv_size, bias=config.attention_qkv_bias)
+                self.v_proj = nn.Linear(config.hidden_size, self.hidden_kv_size, bias=config.attention_qkv_bias)
+            self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)
+            if config.moe:
+                self.moe_o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)
+        if self.position_embedding_type == 'rope':
+            self.rotary_emb = RotaryEmbedding(
+                dim=self.head_dim,
+                max_position_embeddings=config.max_position_embeddings,
+                base=config.get_rotary_base()
+            )
+        elif self.position_embedding_type == 'alibi':
+            # Plain Python list of per-head slopes; converted to a tensor inside forward().
+            self.alibi_slopes = get_slopes(self.num_heads)
+        self.attention = create_attention_layer(self.hidden_size, self.num_heads, self.head_dim)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        # Reshape (bsz, seq_len, num_heads * head_dim) -> (bsz, num_heads, seq_len, head_dim).
+        # Not called anywhere inside this class.
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def _repeat_kv(self, hidden_states: torch.Tensor, num_heads: int) -> torch.Tensor:
+        # Repeat the k/v heads (dim 1) so that there are `num_heads` of them; `num_heads` must be a multiple.
+        assert hidden_states.size(1) <= num_heads and num_heads % hidden_states.size(1) == 0
+        return repeat_kv(hidden_states, num_heads // hidden_states.size(1))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        seqlens: Optional[torch.IntTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        group_index=None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Run self-attention over ``hidden_states``.
+        Returns ``(attn_output, None, past_key_value)``: the second element (attention
+        weights) is always ``None`` and ``output_attentions`` is not used; the third is the
+        ``(key, value)`` cache when ``use_cache`` is true, otherwise ``None``.
+        When ``seqlens`` is given the flash path is used (``use_flash=True``, causal) and
+        ``attention_mask`` is ignored; otherwise ``attention_mask`` is passed to the
+        non-flash path.
+        """
+        bsz, q_len = hidden_states.shape[:2]
+        if self.config.attention_qkv_pack:
+            if self.config.moe and group_index is not None:
+                # MoE path: the weight is chosen between W_pack and moe_W_pack by moe_matmul using group_index.
+                proj = moe_matmul(hidden_states, [self.W_pack.weight, self.moe_W_pack.weight], group_index, lambda x, y: torch.einsum('bd,ld->bl', x, y))
+                if self.config.attention_qkv_bias:
+                    # Only W_pack carries a bias (moe_W_pack is created with bias=False); it is added to every token.
+                    proj += self.W_pack.bias
+            else:
+                proj = self.W_pack(hidden_states)
+            query_states, key_states, value_states = proj.split([self.hidden_size, self.hidden_kv_size, self.hidden_kv_size], dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        # (B, S, hidden_size) -> (B, num_heads, S, head_size)
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        # (B, S, hidden_size) -> (B, num_kv_heads, S, head_size)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        # Total key/value length = new tokens + cached tokens.
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        if self.position_embedding_type == 'rope':
+            # NOTE(review): presumably each rank holds 1/sequence_parallel_size of the sequence, so the
+            # table must cover the full length -- confirm against sequence_parallel_utils.
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len * get_sequence_parallel_size())
+            query_states, key_states = apply_rotary_pos_emb(
+                query_states, key_states, cos, sin,
+                get_sequence_parallel_chunk(position_ids)
+            )
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        # The cache stores the un-repeated k/v heads (it is captured before _repeat_kv below).
+        past_key_value = (key_states, value_states) if use_cache else None
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = self._repeat_kv(key_states, query_states.size(1))
+        value_states = self._repeat_kv(value_states, query_states.size(1))
+        if seqlens is not None:
+            seqlens = seqlens.to(dtype=torch.int32)
+            # Adjacent differences of seqlens are the individual segment lengths (seqlens is cumulative).
+            max_seqlen = (seqlens[1:] - seqlens[:-1]).max().item()
+            if self.position_embedding_type == 'alibi':
+                alibi_slopes = torch.tensor(self.alibi_slopes, dtype=torch.float32).to(query_states.device)
+            else:
+                alibi_slopes = None
+            attn_output = self.attention(
+                query_states, key_states, value_states, seqlens, seqlens,
+                max_seqlen, max_seqlen, causal=True, alibi_slopes=alibi_slopes, use_flash=True)
+        else:
+            # NOTE(review): alibi_slopes is not passed on this path -- confirm 'alibi' models never reach it.
+            attn_output = self.attention(
+                query_states, key_states, value_states, attn_mask=attention_mask, use_flash=False)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        if not self.config.moe or group_index is None:
+            attn_output = self.o_proj(attn_output)
+        else:
+            # MoE path: the output projection weight is chosen between o_proj and moe_o_proj by group_index.
+            attn_output = moe_matmul(attn_output, [self.o_proj.weight, self.moe_o_proj.weight], group_index, lambda x, y: torch.einsum('bd,ld->bl', x, y))
+        return attn_output, None, past_key_value
+class DecoderLayer(nn.Module):
+    def __init__(self, config: BaichuanConfig, is_sparse=False):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Attention(config=config, is_sparse=is_sparse)
+        self.mlp = MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        seqlens: Optional[torch.IntTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        group_index=None,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            seqlens=seqlens,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            group_index=group_index,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class BaichuanPreTrainedModel(PreTrainedModel):
+    config_class = BaichuanConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DecoderLayer"]
+    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear) or isinstance(module, nn.Conv1d):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, BaichuanModel):
+            module.gradient_checkpointing = value
+class BaichuanModel(BaichuanPreTrainedModel):
+    def __init__(self, config: BaichuanConfig):
+        """Build the optional audio/visual/video encoders and bridges, the token embedding, the decoder stack and the final norm.
+        Submodules are registered in the order below; keep that order stable.
+        """
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        # Largest merge_size among the enabled visual/video configs (1 when neither is enabled).
+        self.merge_size = 1
+        if config.audio_config.enable:
+            self.audio_model = BaichuanAudioEncoder(config.audio_config)
+            self.audio_bridge_model = BaichuanAudioBridge(config)
+        if config.visual_config.enable:
+            self.visual_model = BaichuanVisualEncoder(config.visual_config)
+            self.visual_bridge_model = BaichuanVisualBridge(config.visual_config)
+            self.merge_size = max(config.visual_config.merge_size, self.merge_size)
+        if config.video_config.enable: # in case there is no visual_config but only video_config
+            if not config.visual_config.enable:
+                # Video reuses self.visual_model; it is built from video_config only when images are disabled.
+                self.visual_model = BaichuanVisualEncoder(config.video_config)
+            self.video_bridge_model = BaichuanVisualBridge(config.video_config)
+            self.merge_size = max(config.video_config.merge_size, self.merge_size)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        # A layer uses the sparse attention variant when its index is listed in config.sparse_attention_layers.
+        self.layers = nn.ModuleList([
+            DecoderLayer(config, is_sparse=layer_idx in config.sparse_attention_layers)
+            for layer_idx in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # NOTE(review): enabled by default; forward() only honours it when self.training is true -- confirm this default is intended.
+        self.gradient_checkpointing = True
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the token embedding module (``self.embed_tokens``)."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module (``self.embed_tokens``) with ``value``."""
+        self.embed_tokens = value
+    def get_multimodal_mask(self, input_ids, pad_token_id, special_token_list):
+        '''
+        获取任意模态的特殊mask，包含以下
+        1. pad mask 表示文本中图像/语音/视频模态提前留出的token位置
+        2. special token mask 特殊token 例如对理解模型<start> <end> 不需要next token prediction
+        3. embedding mask / lm_head mask 标记出特殊token在embedding中的mask
+        '''
+        pad_mask = torch.eq(input_ids, pad_token_id)
+        sp_mask = torch.zeros_like(input_ids, dtype=torch.bool)
+        lm_head_mask = torch.zeros([self.config.vocab_size, 1], dtype=torch.bool)
+        for sp_id in special_token_list:
+            sp_mask = torch.logical_or(sp_mask, torch.eq(input_ids, sp_id))
+            lm_head_mask[sp_id, 0] = True
+        return pad_mask, sp_mask, lm_head_mask
+    def get_audio_embed(
+            self,
+            input_ids,
+            text_embedding,  # 1. self.embed_tokens(input_ids) 2. result of other modalities
+            features,  # list of tensors
+            encoder_length,
+            bridge_length,
+            group_index=None,  # per-token modality index, for MoE
+        ):
+        """Replace audio placeholder positions in ``text_embedding`` with encoded audio features.
+        Positions where ``input_ids`` equals ``config.audio_config.audio_pad_token_id`` receive,
+        in order, the rows produced by the audio encoder + bridge; all other positions keep the
+        text embedding. Returns ``(final_embedding, group_index)``, where ``group_index`` marks
+        the audio positions (1 when no ``group_index`` was passed in, otherwise
+        ``max(group_index) + 1``, added in place).
+        """
+        pad_mask, sp_mask, _ = self.get_multimodal_mask(input_ids, self.config.audio_config.audio_pad_token_id, self.config.multimodal_special_token_list)
+        if features is None or len(features) <= 0 : # empty list or None: use a fake input so gradients still flow back
+            features, encoder_length, bridge_length = self.audio_model.fake_input(input_ids.device)
+            fake_input = True
+        else:
+            fake_input = False
+        audio_embed = self.audio_model(features, encoder_length)
+        audio_embed = self.audio_bridge_model(audio_embed, bridge_length)  # (?, d)
+        if not self.training:  # inference supports auto device map: move the multimodal output onto the same device as input_ids
+            audio_embed = audio_embed.to(input_ids.device)
+        if not fake_input:  # check that the number of multimodal tokens matches the pad mask (incorrect truncation causes a mismatch)
+            assert pad_mask.sum() == audio_embed.shape[0]
+        else:
+            assert pad_mask.sum() <= 0  # 0 vs 1
+        # merge this modality's embeddings with the text embeddings
+        # At pad positions input_ids becomes the running index (0, 1, 2, ...) of that placeholder over the flattened batch.
+        input_ids = torch.where(pad_mask, torch.cumsum(pad_mask.view(-1).to(input_ids), dim=0).view(input_ids.shape)-1, input_ids)
+        if self.config.train_multimodal_special_tokens_only and self.training:
+            # only special tokens pass gradients to the embedding weight, keeping the LLM part unchanged
+            # NOTE: the special token list should be shared across modalities, otherwise some of them get stop-gradient
+            sp_mask = sp_mask.unsqueeze(-1).to(text_embedding)
+            text_embedding = (1 - sp_mask) * text_embedding.detach() + sp_mask * text_embedding
+        text_embedding = (1 - pad_mask.to(text_embedding)).unsqueeze(-1) * text_embedding  # zero out pad-token positions (no gradient)
+        multimodal_embedding = torch.embedding(audio_embed, input_ids * pad_mask)  # non-pad positions look up row idx=0
+        multimodal_embedding = pad_mask.to(multimodal_embedding).unsqueeze(-1) * multimodal_embedding  # zero out non-pad positions
+        final_embedding = multimodal_embedding.to(text_embedding) + text_embedding
+        if group_index is None:
+            group_index = pad_mask.to(torch.int32)
+        else:
+            current_index = torch.max(group_index) + 1
+            group_index += pad_mask.to(torch.int32) * current_index  # assumes modalities do not overlap
+        return final_embedding, group_index  # never return None for group_index, so MoE parameters always receive gradients
+    def get_visual_embed(
+            self,
+            input_ids,
+            text_embedding,  # 1. self.embed_tokens(input_ids) 2. result of other modalities
+            images,
+            group_index,  # per-token modality index, for MoE
+            images_grid
+        ):
+        """Replace image placeholder positions in ``text_embedding`` with encoded image features.
+        Positions where ``input_ids`` equals ``config.visual_config.image_pad_token_id`` receive,
+        in order, the rows produced by ``visual_model`` + ``visual_bridge_model``; all other
+        positions keep the incoming embedding. Returns ``(final_embedding, group_index)``, where
+        ``group_index`` marks the image positions (1 when ``group_index`` is None, otherwise
+        ``max(group_index) + 1``, added in place).
+        """
+        # TODO merge the duplicated logic with get_audio_embed to reduce redundancy
+        pad_mask, sp_mask, _ = self.get_multimodal_mask(input_ids, self.config.visual_config.image_pad_token_id, self.config.multimodal_special_token_list)
+        if images is None or len(images) <= 0 : # empty list or None: use a fake input so gradients still flow back
+            images = self.visual_model.fake_input(input_ids.device, self.merge_size)
+            images_grid = [(1, self.merge_size, self.merge_size)]
+            fake_input = True
+        else:
+            fake_input = False
+        images = torch.cat(images, dim=0)
+        images_grid = torch.tensor(np.array(images_grid))
+        visual_embed = self.visual_model(images, grid_thw=images_grid)
+        visual_embed = self.visual_bridge_model(visual_embed)
+        if not self.training:  # inference supports auto device map: move the multimodal output onto the same device as input_ids
+            visual_embed = visual_embed.to(input_ids.device)
+        if not fake_input:  # check that the number of multimodal tokens matches the pad mask (incorrect truncation causes a mismatch)
+            assert pad_mask.sum() == visual_embed.shape[0], '{} != {}'.format(pad_mask.sum(), visual_embed.shape[0])
+        else:
+            assert pad_mask.sum() <= 0, '{} != {}'.format(pad_mask.sum(), visual_embed.shape[0])
+        # merge this modality's embeddings with the text embeddings
+        # At pad positions input_ids becomes the running index (0, 1, 2, ...) of that placeholder over the flattened batch.
+        input_ids = torch.where(pad_mask, torch.cumsum(pad_mask.view(-1).to(input_ids), dim=0).view(input_ids.shape)-1, input_ids)
+        if self.config.train_multimodal_special_tokens_only and self.training:
+            # only special tokens pass gradients to the embedding weight, keeping the LLM part unchanged
+            # NOTE: the special token list should be shared across modalities, otherwise some of them get stop-gradient
+            sp_mask = sp_mask.unsqueeze(-1).to(text_embedding)
+            text_embedding = (1 - sp_mask) * text_embedding.detach() + sp_mask * text_embedding
+        text_embedding = (1 - pad_mask.to(text_embedding)).unsqueeze(-1) * text_embedding  # zero out pad-token positions (no gradient)
+        multimodal_embedding = torch.embedding(visual_embed, input_ids * pad_mask)  # non-pad positions look up row idx=0
+        multimodal_embedding = pad_mask.to(multimodal_embedding).unsqueeze(-1) * multimodal_embedding  # zero out non-pad positions
+        final_embedding = multimodal_embedding.to(text_embedding) + text_embedding
+        if group_index is None:
+            group_index = pad_mask.to(torch.int32)
+        else:
+            current_index = torch.max(group_index) + 1
+            group_index += pad_mask.to(torch.int32) * current_index  # assumes modalities do not overlap
+        return final_embedding, group_index  # never return None for group_index, so MoE parameters always receive gradients
+    def get_video_embed(
+            self,
+            input_ids,
+            text_embedding,  # 1. self.embed_tokens(input_ids) 2. 其他模态结果
+            images,
+            group_index,  # 某种模态的编号 for MoE
+            images_grid
+        ):
+        # TODO 与get_audio_embed合并重复功能 减少冗余代码
+        pad_mask, sp_mask, _ = self.get_multimodal_mask(input_ids, self.config.video_config.video_place_token_id, self.config.multimodal_special_token_list)
+        if images is None or len(images) <= 0 : # 空list or None 保证梯度回传
+            images = self.visual_model.fake_input(input_ids.device, self.merge_size)
+            images_grid = [(1, self.merge_size, self.merge_size)]
+            fake_input = True
+        else:
+            fake_input = False
+        images = torch.cat(images, dim=0)
+        images_grid = torch.tensor(np.array(images_grid))
+        visual_embed = self.visual_model(images, grid_thw=images_grid)
+        visual_embed = self.video_bridge_model(visual_embed)
+        if not self.training:  # 推理支持auto map 把多模态模块输出和input_ids 统一到一个device
+            visual_embed = visual_embed.to(input_ids.device)
+        if not fake_input:  # 检查多模态token 和 pad mask数量一致 （不正确的截断会导致该问题）
+            assert pad_mask.sum() == visual_embed.shape[0], '{} != {}'.format(pad_mask.sum(), visual_embed.shape[0])
+            assert pad_mask.sum() == visual_embed.shape[0], '{} != {}'.format(pad_mask.sum(), visual_embed.shape[0])
+        else:
+            assert pad_mask.sum() <= 0, '{} != {}'.format(pad_mask.sum(), visual_embed.shape[0])
+        # 合并 当前模态embeddings 和text embeddings
+        input_ids = torch.where(pad_mask, torch.cumsum(pad_mask.view(-1).to(input_ids), dim=0).view(input_ids.shape)-1, input_ids)
+        if self.config.train_multimodal_special_tokens_only and self.training:
+            # 仅special token传梯度到embedding weight, 保证LLM部分不变
+            # 注意: 多种模态之间special token list应该共享，否则会有部分被stop gradient
+            sp_mask = sp_mask.unsqueeze(-1).to(text_embedding)
+            text_embedding = (1 - sp_mask) * text_embedding.detach() + sp_mask * text_embedding
+        text_embedding = (1 - pad_mask.to(text_embedding)).unsqueeze(-1) * text_embedding  # pad token位置填0 (不传梯度)
+        multimodal_embedding = torch.embedding(visual_embed, input_ids * pad_mask)  # 非 pad token 位置填idx=0位置结果
+        multimodal_embedding = pad_mask.to(multimodal_embedding).unsqueeze(-1) * multimodal_embedding  # 非pad token 位置填0
+        final_embedding = multimodal_embedding.to(text_embedding) + text_embedding
+        if group_index is None:
+            group_index = pad_mask.to(torch.int32)
+        else:
+            current_index = torch.max(group_index) + 1
+            group_index += pad_mask.to(torch.int32) * current_index  # 假设模态无重叠
+        return final_embedding, group_index  # group_index 不传None 防止MoE部分参数无梯度
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        seqlens: Optional[torch.IntTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        audios: Optional[List|torch.Tensor] = None,
+        encoder_length: Optional[torch.Tensor] = None,
+        bridge_length: Optional[torch.Tensor] = None,
+        images: Optional[List|torch.Tensor] = None,
+        images_grid: Optional[List|torch.Tensor] = None,
+        videos: Optional[List|torch.Tensor] = None,
+        videos_grid: Optional[List|torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        group_index = None
+        if inputs_embeds is None:
+            sp_input_ids = get_sequence_parallel_chunk(input_ids)
+            inputs_embeds = self.embed_tokens(sp_input_ids)
+            if self.config.audio_config.enable:
+                inputs_embeds, group_index = self.get_audio_embed(sp_input_ids, inputs_embeds, audios, encoder_length, bridge_length)
+            if self.config.visual_config.enable:
+                inputs_embeds, group_index = self.get_visual_embed(sp_input_ids, inputs_embeds, images, group_index, images_grid)  # 注意更新group index
+            if self.config.video_config.enable:
+                inputs_embeds, group_index = self.get_video_embed(sp_input_ids, inputs_embeds, videos, group_index, videos_grid)  # 注意更新group index
+        if seqlens is not None and seqlens.ndim == 2:
+            # batch multi-pack 样本拉平
+            cu_seqlens = []
+            offset, seqlen = 0, seqlens.size(1)
+            for lens in seqlens:
+                cu_seqlens.append(offset)
+                cu_seqlens.extend((lens[(lens > 0) & (lens < seqlen)] + offset).tolist())
+                offset += seqlen
+            cu_seqlens.append(offset)
+            seqlens = torch.tensor(cu_seqlens, dtype=seqlens.dtype, device=seqlens.device)
+        elif seqlens is None and self.training:
+            # 兼容预训练场景, 此时 seqlens=None, 默认 maxlength
+            seqlens = torch.arange(
+                end=input_ids.size(0) + 1,
+                dtype=torch.int32,
+                device=input_ids.device
+            ) * input_ids.size(1)
+        if seqlens is not None:
+            attention_mask = None  # unset attention_mask to save memory
+        if seqlens is None and attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        if attention_mask is not None:
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+        # embed positions
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, False, group_index)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    seqlens,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    seqlens=seqlens,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    group_index=group_index,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class NormHead(nn.Module):
+    def __init__(self, hidden_size, vocab_size, bias=False):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.weight = nn.Parameter(torch.empty((self.vocab_size, self.hidden_size)))
+        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+    def forward(self, hidden_states, mask=None):
+        norm_weight = nn.functional.normalize(self.weight)
+        if mask is not None:
+            mask = mask.to(norm_weight)
+            norm_weight = norm_weight * mask + (1 - mask) * norm_weight.detach()
+        return nn.functional.linear(hidden_states, norm_weight)
+    def extra_repr(self) -> str:
+        return f'in_features={self.hidden_size}, out_features={self.vocab_size}'
+@dataclass
+class BaichuanMMCausalLMOutputWithPast(ModelOutput):
+    """Output container returned by ``BaichuanForCausalLM.forward`` when ``return_dict`` is true."""
+    # Mean per-token cross-entropy over the positions whose shifted label is > -1.
+    loss: Optional[torch.FloatTensor] = None
+    # Output of the language-model head.
+    logits: torch.FloatTensor = None
+    # The next four fields are copied from the decoder output unchanged.
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    # Detached mean cross-entropy restricted to valid positions that are not flagged by the multimodal special-token mask.
+    text_nt_loss: Optional[torch.FloatTensor] = None
+    # Unreduced (reduction='none') cross-entropy over the flattened shifted tokens; None when no labels were given.
+    flatten_loss: Optional[torch.FloatTensor] = None
+class BaichuanForCausalLM(BaichuanPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.model = BaichuanModel(config)
+        if config.use_norm_head:
+            self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
+        else:
+            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def bind_processor(self, tokenizer, **kwargs):
+        self.processor = BaichuanMMProcessor(
+                tokenizer=tokenizer,
+                config=self.config,
+                **kwargs,
+                )
+        return self.processor
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        seqlens: Optional[torch.IntTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        audios: Optional[List|torch.Tensor] = None,
+        encoder_length: Optional[torch.Tensor] = None,
+        bridge_length: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        images_grid: Optional[torch.Tensor] = None,
+        videos: Optional[torch.Tensor] = None,
+        videos_grid: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        _, sp_mask, _ = self.model.get_multimodal_mask(input_ids, self.config.audio_config.audio_pad_token_id, self.config.multimodal_special_token_list)
+        # TODO 放开部分可学习的special token lmhead参数
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            seqlens=seqlens,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            audios=audios,
+            encoder_length=encoder_length,
+            bridge_length=bridge_length,
+            images=images,
+            images_grid=images_grid,
+            videos=videos,
+            videos_grid=videos_grid,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        # 部分可学习的special token放开lm head梯度
+        special_with_loss_list = list(set(self.config.multimodal_special_token_list) - set(self.config.multimodal_special_token_no_loss_list))
+        _, sp_with_loss_mask, lm_head_mask = self.model.get_multimodal_mask(input_ids, self.config.audio_config.audio_pad_token_id, special_with_loss_list)
+        if self.config.train_multimodal_special_tokens_only and self.training and len(special_with_loss_list) > 0:
+            if self.config.use_norm_head:
+                logits = self.lm_head(hidden_states, mask=lm_head_mask)
+            else:
+                lm_head_mask = lm_head_mask.to(self.lm_head.weight)
+                norm_weight = self.lm_head.weight * lm_head_mask + (1 - lm_head_mask) * self.lm_head.weight.detach()
+                logits = torch.einsum('bsd,ld->bsl', hidden_states, norm_weight)
+        else:
+            logits = self.lm_head(hidden_states)
+        loss = torch.tensor(0, device=hidden_states.device, dtype=hidden_states.dtype)
+        text_nt_loss = torch.tensor(0, device=hidden_states.device, dtype=hidden_states.dtype)
+        flatten_loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            valid_mask = torch.gt(shift_labels, -1)  # label < 0 视为pad位置
+            sp_mask = sp_mask[..., 1:].contiguous()
+            text_mask = torch.logical_and(valid_mask, torch.logical_not(sp_mask))
+            # Flatten the tokens
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            flatten_loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100, reduction='none')
+            loss = torch.mean(torch.masked_select(flatten_loss, valid_mask.view(-1)))
+            text_nt_loss = torch.mean(torch.masked_select(flatten_loss, text_mask.view(-1))).detach()
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return BaichuanMMCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            text_nt_loss=text_nt_loss,
+            flatten_loss=flatten_loss
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        elif past_key_values is not None:
+            model_inputs = {"input_ids": input_ids}
+        else:
+            model_inputs = {"input_ids": input_ids,
+                            "audios": kwargs.get("audios", None), "encoder_length": kwargs.get("encoder_length", None), "bridge_length": kwargs.get("bridge_length", None),
+                            "images": kwargs.get("images", None),
+                            "videos": kwargs.get("videos", None)
+                            }
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "images_grid":  kwargs.get("images_grid"),
+                "videos_grid":  kwargs.get("videos_grid"),
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+        return reordered_past
+    def chat(self, tokenizer, messages: List[dict], stream=False,
+             generation_config: Optional[GenerationConfig]=None):
+        generation_config = generation_config or self.generation_config
+        input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
+        if stream:
+            streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+            Thread(target=self.generate, kwargs=dict(
+                inputs=input_ids, streamer=streamer,
+                generation_config=generation_config,
+            )).start()
+            return streamer
+        else:
+            outputs = self.generate(input_ids, generation_config=generation_config)
+            response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
+            return response

moe.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import torch
+from torch.nn import functional as F
+# @torch.compile
+def moe_matmul(inputs, weight_list, group_index, linear_fn=lambda x, y: torch.matmul(x, y)):
+    """
+    inputs: tensor (bs, sl, dim)
+    weight_list: MoE weights, list of [(dim, dim')]
+    group_index: (bs, sl), max(group_index) + 1 == len(weight_list), 在sl维上表示分组信息
+    group_nums: 表示MoE的个数
+    example:
+        拉平后bs*sl的group index 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1  (17)
+        按0, 1 分别正反编码index
+        0:
+        cumsum: 0 1 2 2 2 2 3 4 4 4 4 5 6 7 7 7 7
+        offset: same
+        mask:   0 1 2 0 0 0 3 4 0 0 0 5 6 7 0 0 0
+        new offset is 7
+        1:
+        cumsum: 0 0 0 1 2 3 3 3 4 5 6 6 6 6 7 8 9
+        offset: 7 7 7 8 9 10 10 10 11 12 13 13 13 13 14 15 16
+        mask:   0 0 0 8 9 10 0 0 11 12 13 0 0 0 14 15 16
+        new offset is 16
+        ...
+        合并encode映射码表
+        0 1 2 8 9 10 3 4 11 12 13 5 6 7 14 15 16
+        执行gather操作，之后将inputs按offset split 分别matmul 再concat
+        decode映射码表
+        0 1 2 8 9 10 3 4 11 12 13 5  6  7  14 15 16  index
+        0 1 2 3 4 5  6 7 8  9  10 11 12 13 14 15 16  value
+        :
+        0 1 2 6 7 11 12 13 3 4 5 8 9 10 14 15 16
+    """
+    bs, sl = group_index.size()
+    group_inputs, cur_offset, group_encode_index = [], 0, 0
+    for group_i in range(len(weight_list)):
+        group_i_mask = torch.eq(group_index.to(torch.int32), group_i).view(bs * sl)  # (bs * sl)
+        group_inputs.append(linear_fn(
+            torch.masked_select(inputs, group_i_mask.view(bs, sl, 1)).view(-1, inputs.size(-1)),
+            weight_list[group_i]))  # (?, dims) X (dims, dims')
+        group_i_index = torch.cumsum(group_i_mask.view(bs * sl).to(torch.int64), axis=0)
+        group_i_index -= 1 if group_i == 0 else 0 # 下标从0开始 只需要在第一个分组处理
+        group_i_index = (cur_offset + group_i_index) * group_i_mask
+        cur_offset = torch.max(group_i_index)
+        group_encode_index += group_i_index
+    group_decode_index = torch.gather(torch.arange(0, bs * sl, step=1, dtype=torch.int64, device=inputs.device), 0, group_encode_index)
+    group_inputs = torch.cat(group_inputs, axis=0)  # (bs * sl, dims')
+    outputs = torch.index_select(group_inputs, 0, group_decode_index).view(bs, sl, -1)
+    return outputs
+if __name__ == "__main__":
+    bs, sl, d = 13, 997, 97
+    dtype = torch.bfloat16
+    inputs = torch.tensor(torch.randn([bs, sl, d], dtype=dtype).cuda(), requires_grad=True)
+    group_num = 2
+    # group_index = torch.remainder(torch.randint(0, 6, (bs, sl)), group_num).cuda()
+    group_index = torch.remainder(torch.randint(0, 6, (bs, sl)), 1).cuda()
+    weights = [torch.tensor(torch.eye(d).cuda().to(dtype), requires_grad=True) for _ in range(group_num)]
+    output = moe_matmul(inputs, weights, group_index)
+    print(inputs - output)
+    loss = torch.sum(output * (group_index+1).to(dtype).view(bs, sl, 1))
+    print(loss)
+    loss.backward()
+    print(inputs.grad[:, :, 0] - group_index.to(dtype))
+    print(weights[-1].grad)

processor_baichuan.py ADDED Viewed

	@@ -0,0 +1,1154 @@

+import requests
+import re, ujson, os, sys, fire, glob, random, time, json
+import numpy as np
+import io
+import torch
+from torch.utils.data import default_collate
+import torchaudio
+from typing import *
+from dataclasses import dataclass, field
+import transformers
+from transformers.modeling_outputs import ModelOutput
+from transformers.audio_utils import mel_filter_bank, spectrogram, window_function
+from functools import lru_cache
+from io import BytesIO
+from PIL import Image
+from qcloud_cos import CosConfig
+from qcloud_cos import CosS3Client
+import tos
+import concurrent.futures as cf
+from transformers.image_transforms import resize, center_crop, get_resize_output_image_size
+from transformers.image_utils import PILImageResampling
+from PIL import Image, ImageOps
+from PIL import ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+import base64
+from decord import VideoReader, cpu
+import cv2
+import av
+import imagesize
+import math
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
+):
+    """Rescales the image so that the following conditions are met:
+    1. Both dimensions (height and width) are divisible by 'factor'.
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    # if height < factor or width < factor:
+        # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = round(height / factor) * factor if height > factor else factor
+    w_bar = round(width / factor) * factor if width > factor else factor
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = math.floor(height / beta / factor) * factor
+        w_bar = math.floor(width / beta / factor) * factor
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+    return h_bar, w_bar
+def select_best_resolution(image_size, candidate_resolutions):
+    '''找到最佳的resolution 对于原图进行放缩
+        image_size 通常为ori_size e.g. (8*336, 16*336)
+        candidate_resolutions 为备选分辨率 e.g. (1*336, 4*336)
+    '''
+    try:
+        original_width, original_height = image_size
+    except:
+        pass
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+    # 从candidate_resolutions 中遍历宽和高
+    for width, height in candidate_resolutions:
+        # width / original_width 和 height / original_height 中最小的那个作为scale
+        scale = min(width / original_width, height / original_height) # e.g. scale =min (1/8, 1/4) = 1/8
+        # 放缩 original_width 和 original_height
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) # e.g. 1*336, 2*336
+        # effective_resolution 为 放缩之后的分辨率 s^2 * w * h
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) # e.g. min(1*336 * 2*336, 8*336 * 16*336)
+        # wasted_resolution 为 放缩前后分辨率的差值
+        wasted_resolution = (width * height) - effective_resolution
+        # 若 （1） 放缩之后的分辨率 比当前的max_effective_resolution更大;
+            # (2)
+        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+            max_effective_resolution = effective_resolution # 更新max_effective_resolution
+            min_wasted_resolution = wasted_resolution # min_wasted_resolution
+            best_fit = (width, height)
+    return best_fit
+def read_video(image_path, max_frame_number, decode_way):
+    if decode_way=='1fps':
+        try:
+            vr = VideoReader(image_path, ctx=cpu(0))
+            total_frame_num = len(vr)
+            fps = round(vr.get_avg_fps())
+            frame_idx = [i for i in range(0, len(vr), fps)]
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [i for i in frames]
+            cnt = len(frames)
+        except Exception as e:
+            print(image_path)
+            print('error is', e)
+            return None
+    elif decode_way=='key':
+        try:
+            with av.open(image_path) as container:
+                stream = container.streams.video[0]
+                stream.codec_context.skip_frame = 'NONKEY'
+                frames = []
+                fps = int(stream.average_rate)
+                cnt = 0
+                for frame in container.decode(stream): # 关键帧存成image patch
+                    image = frame.to_image()
+                    frames.append(image)
+                    cnt += 1
+        except Exception as e:
+            print('error is', e)
+            return None
+    if frames is None or len(frames)==0:
+        return None
+    if len(frames)>max_frame_number and max_frame_number>0:
+        # 生成均匀间隔的索引
+        indices = np.linspace(0, len(frames) - 1, max_frame_number, dtype=int)
+        # 根据索引获取对应元素
+        sampled_elements = [frames[idx] for idx in indices]
+        frames = sampled_elements
+    return frames
+class BaichuanImageProcessor:
+    def __init__(self, config, **kwargs):
+        self.config = config  # visual_config
+        self.min_pixels = self.config.min_pixels if hasattr(self.config, 'min_pixels') else 56 * 56
+        self.max_pixels = self.config.max_pixels if hasattr(self.config, 'max_pixels') else 28 * 28 * 1280
+        self.patch_size = self.config.patch_size if hasattr(self.config, 'patch_size') else 14
+        self.temporal_patch_size = self.config.temporal_patch_size if hasattr(self.config, 'temporal_patch_size') else 2
+        self.merge_size = self.config.merge_size if hasattr(self.config, 'merge_size') else 2
+        self.spatial_merge_size = self.config.spatial_merge_size if hasattr(self.config, 'spatial_merge_size') else 2
+    def image_transform(self, strseq, return_mm_data = True):
+        image = None
+        if isinstance(strseq, str):
+            if return_mm_data:
+                image = Image.open(strseq).convert("RGB")
+        else:
+            image = Image.open(BytesIO(strseq)).convert("RGB")
+        image = np.array(image.convert("RGB")) # 这一步首先将图像转换为 RGB 格式，确保图像有三个通道（R、G、B）。然后使用 np.array() 将其转换为 NumPy 数组，方便后续处理。
+        image_org_size = image.shape[:2] # 这里保存了图像的原始大小（高度和宽度），image.shape 返回图像的形状 (高度, 宽度, 通道数)，而 image.shape[:2] 提取了前两个值，即原始的高度和宽度。这个信息可以用于后续的对比或其他处理。
+        # resize, crop, scale, normalize
+        # 接受目标尺寸作为输入参数，通常是目标尺寸的短边或长边长度。例如，如果指定目标短边为 336 像素，函数会自动计算出对应的长边大小，以保持图像的宽高比。
+        # 输出一个新的尺寸，这个尺寸通常是 (宽度, 高度) 格式，用于后续的图像调整操作，如缩放或裁剪。
+        resized_height, resized_width = smart_resize(
+            image_org_size[0], image_org_size[1],
+            factor=self.patch_size * self.spatial_merge_size,
+            min_pixels=self.min_pixels,
+            max_pixels=self.max_pixels,
+        )
+        output_size = (resized_height, resized_width)
+        # output_size = get_resize_output_image_size(image, self.config.crop_size, False)  # 短边resize到336
+        # 使用 resize 函数将图像调整到 output_size 大小。PILImageResampling.BICUBIC 指定使用双三次插值法来进行图像缩放，这种方法通常能够提供较好的图像质量。
+        # image: 输入的图像数据，可以是 NumPy 数组或 PIL 图像对象；output_size: 目标大小，通常是一个二元组 (宽度, 高度)。这个尺寸可以是图像的绝对大小，也可以是相对于原始图像的比例；
+        # resample: 可选的重采样方法，通常用于确定如何插值像素。例如，PILImageResampling.BICUBIC 表示使用双三次插值法，这是一种平滑的插值方法，常用于图像缩放。
+        image = resize(image, output_size, PILImageResampling.BICUBIC)
+        # 从图像中心裁剪出一个指定大小的区域，这里是一个正方形区域 self.config.crop_size x self.config.crop_size。center_crop 函数的参数 return_numpy=True 表示返回一个 NumPy 数组形式的裁剪图像。
+        # image = center_crop(image, (self.config.crop_size, self.config.crop_size), return_numpy=True)
+        img = image.transpose(2, 0, 1)
+        # 对图像进行归一化和标准化处理
+        image = (img / 255.0 - np.array(self.config.image_mean)[:, np.newaxis, np.newaxis]) / np.array(self.config.image_std)[:,np.newaxis,np.newaxis]
+        # 处理成patch
+        patches = image[np.newaxis, :]
+        if patches.shape[0] == 1:
+            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        channel = patches.shape[1]
+        grid_t = patches.shape[0] // self.temporal_patch_size
+        grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
+        patches = patches.reshape(
+            grid_t,
+            self.temporal_patch_size,
+            channel,
+            grid_h // self.spatial_merge_size,
+            self.spatial_merge_size,
+            self.patch_size,
+            grid_w // self.spatial_merge_size,
+            self.spatial_merge_size,
+            self.patch_size,
+        )
+        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
+        flatten_patches = patches.reshape(
+            grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
+        )
+        return flatten_patches, image_org_size, (grid_t, grid_h, grid_w)
+class BaichuanAudioProcessor:
+    # 包含基本的音频特征抽取模块 + 输入数据解析模块 + cos请求/缓存模块
+    def __init__(
+        self,
+        config,  # audio processor config
+        **kwargs
+    ):
+        # make sure you have install 'conda install -c conda-forge 'ffmpeg<7'' for torchaudio
+        assert(len(torchaudio.list_audio_backends()) > 0)
+        self.config = config
+        self.mel_filters = mel_filter_bank(
+            num_frequency_bins=1 + self.config.n_fft // 2,
+            num_mel_filters=self.config.num_mel_bins,
+            min_frequency=0.0,
+            max_frequency=self.config.sampling_rate / 2.0,
+            sampling_rate=self.config.sampling_rate,
+            norm="slaney",
+            mel_scale="slaney",
+        )
+    @staticmethod
+    def zero_mean_unit_var_norm(x):
+        return (x - x.mean()) / torch.sqrt(x.var() + 1e-8)
+    def load_audio_waveform(self, uri, return_tensors=True, do_normalize=False):
+        metadata = torchaudio.info(uri)  # sample_rate, num_frames, num_channels, bits_per_sample, encoding=PCM_S
+        assert(metadata.num_channels <= 2), "acoustic file with {} channels.".format(metadata.num_channels)  # whisper only accept mono channel audio
+        waveform_tensor, _ = torchaudio.load(uri, normalize=True)
+        if self.config.sampling_rate != metadata.sample_rate:
+            waveform_tensor = torchaudio.functional.resample(waveform_tensor, metadata.sample_rate, self.config.sampling_rate)
+        # downmix to mono channel https://trac.ffmpeg.org/wiki/AudioChannelManipulation
+        if metadata.num_channels > 1:
+            waveform_tensor = torch.mean(waveform_tensor, dim=0, keepdim=True)
+        # normalized to zero mean (Qwen Audio没有处理 但Whisper官方实现)
+        if do_normalize:
+            waveform_tensor = self.zero_mean_unit_var_norm(waveform_tensor)
+        if return_tensors:  # (channels, samples)
+            return waveform_tensor
+        else:
+            return waveform_tensor.numpy()
+    def split_with_overlap(self, waveform):  # 如果长度超过最大长度限制 分割为带overlap的多段
+        channels, wave_samples = waveform.shape
+        max_audio_samples = self.config.max_audio_seconds * self.config.sampling_rate
+        if wave_samples <= max_audio_samples or self.config.split_overlap < 0:
+            return [waveform]  # 没有超出最大长度or截断逻辑 统一返回list
+        split_waveform, start = [], 0
+        while start < wave_samples:  # 20240724修改 统一按秒数对齐overlap 保证不同sampling rate/n_fft/hop length配置下采到的数据是一致的
+            if start > int(self.config.sampling_rate * self.config.split_overlap):
+                start -= int(self.config.sampling_rate * self.config.split_overlap)  # 0表示没有overlap，>0 overlap对应秒数
+            end = min(start + max_audio_samples, wave_samples)
+            split_waveform.append(waveform[:, start:end])  # 注意这里可能会切割出特别短的片段 需要在预处理判断并丢弃
+            start = end
+        return split_waveform
+    @classmethod
+    def inference_output_length(cls, config, input_length):
+        # for whisper + bridge
+        kernel_size = config.kernel_size
+        stride_size = config.stride_size
+        avg_pooler = config.avg_pooler
+        encoder_length = (input_length + 2 * (kernel_size // 2) - kernel_size) // 1 + 1  # conv layer1 with pad=1
+        encoder_length = (encoder_length + 2 * (kernel_size // 2) - kernel_size) // stride_size + 1  # conv layer2 with pad=1
+        if avg_pooler > 1:
+            bridge_length = encoder_length // avg_pooler
+        return encoder_length, bridge_length
+    def extract_fbank_features(self, waveform):
+        # ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py
+        channels, wave_samples = waveform.shape
+        assert(wave_samples >= self.config.n_fft)
+        valid_frame_nums = min(self.config.max_audio_seconds * self.config.sampling_rate // self.config.hop_length, wave_samples // self.config.hop_length + 1)
+        if wave_samples < self.config.max_audio_seconds * self.config.sampling_rate:
+            waveform = torch.nn.functional.pad(waveform, (0, self.config.max_audio_seconds * self.config.sampling_rate - wave_samples), "constant", 0)
+        else:
+            waveform = waveform[:, :self.config.max_audio_seconds * self.config.sampling_rate]
+        window = torch.hann_window(self.config.n_fft)
+        stft = torch.stft(waveform, self.config.n_fft, self.config.hop_length, window=window, return_complex=True)  # fft, len(wave) // n_fft // 2 + 1
+        magnitudes = stft[..., :-1].abs() ** 2
+        mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
+        mel_spec = mel_filters.T @ magnitudes
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+        if waveform.dim() == 2:
+            max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
+            log_spec = torch.maximum(log_spec, max_val - 8.0)
+        else:
+            log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = (log_spec + 4.0) / 4.0
+        log_spec = log_spec[0].numpy()  # (channel, filters, samples) -> (filters, samples)
+        log_spec[:, valid_frame_nums:] = 0.0  # pad0 在collect时取batch内最大长度
+        return log_spec, valid_frame_nums
+    def data_augment(self, feature: np.array, input_length, training=True):
+        # reference https://arxiv.org/pdf/1904.08779
+        # run only on cpu
+        def mask_start_indices(input_length, mask_length, min_masks, mask_prob):
+            # 计算总共需要mask的span数 之后随机筛选span开始下标
+            num_masked_span = int(mask_prob * input_length / mask_length + random.random())
+            num_masked_span = max(num_masked_span, min_masks)
+            start_indices = list(range(input_length - mask_length))
+            random.shuffle(start_indices)
+            start_indices = start_indices[:num_masked_span]
+            return start_indices
+        if not training or (self.config.mask_time_prob <= 0 and self.config.mask_feature_prob <= 0):
+            return feature
+        if input_length < self.config.mask_time_length * self.config.mask_time_min_masks + 1:
+            return feature
+        if self.config.num_mel_bins < self.config.mask_feature_length * self.config.mask_feature_min_masks + 1:
+            return feature
+        if self.config.mask_time_prob > 0:
+            start_indices = mask_start_indices(input_length, self.config.mask_time_length, self.config.mask_time_min_masks, self.config.mask_time_prob)
+            for start_idx in start_indices:
+                feature[:, start_idx: start_idx + self.config.mask_time_length] = 0.0
+        if self.config.mask_feature_prob > 0:
+            start_indices = mask_start_indices(self.config.num_mel_bins, self.config.mask_feature_length, self.config.mask_feature_min_masks, self.config.mask_feature_prob)
+            for start_idx in start_indices:
+                feature[start_idx: start_idx + self.config.mask_feature_length, :] = 0.0
+        return feature
+class CosClient():
+    def __init__(self, bucket_name='crawl-pic-1317568651',
+            max_retries=2):
+        self.config = CosConfig(
+            Endpoint="cos.ap-guangzhou.myqcloud.com",
+            # Region='ap-guangzhou',
+            SecretId='AKIDnRpxoOghgVs0tkU3Mfv20jAMI0SRDj02',
+            SecretKey='td9tRlqiPvEJ8i27wXwBIDiy5ye6JGyS',
+            Token=None, Scheme='https', Timeout=300)
+        self.client = CosS3Client(self.config)
+        self.max_retries = max_retries
+        self.bucket_name = bucket_name
+    def __call__(self, relative_path, bucket_name=None):
+        if bucket_name is None or len(bucket_name) <= 0:
+            bucket_name = self.bucket_name
+        multimodal_bytes = None
+        for _ in range(self.max_retries):
+            try:
+                response = self.client.get_object(Bucket=bucket_name, Key=relative_path)
+                fp = response['Body'].get_raw_stream()
+                multimodal_bytes = fp.read()
+                break
+            except Exception as e:
+                time.sleep(0.01)
+                continue
+        return multimodal_bytes
+class TosClient(object):
+    def __init__(self):
+        ak = "AKLTYTM3MWY5MTFhNDgyNDk4YjhmYTE0ZTE3YTk5ZmU1MjU"
+        sk = "TVRRM1pUZGtaVEJqWTJJd05HSTNPR0ppWVdKa1lqYzVORFUwTlRobU1UVQ=="
+        endpoint = "tos-cn-beijing.ivolces.com"  # "tos-cn-beijing.ivolces.com"
+        region = "cn-beijing"
+        self.bucket_name = "audio-dataset"
+        self.client = tos.TosClientV2(ak, sk, endpoint, region)
+    def __call__(self, path, bucket_name=None):
+        if bucket_name is None:
+            bucket_name = self.bucket_name
+        for _ in range(2):
+            try:
+                object_stream = self.client.get_object(bucket_name, path)
+                return object_stream.read()
+            except Exception as e:
+                time.sleep(0.01)
+                continue
+        return None
+@dataclass
+class BaichuanProcessorOutput(ModelOutput):
+    input_ids: Optional["List|torch.Tensor"] = None
+    labels: Optional["List|torch.Tensor"] = None
+    attention_mask: Optional["List|torch.Tensor"] = None
+    position_ids: Optional["List|torch.Tensor"] = None
+    seqlens: Optional["List|torch.Tensor"] = None  # 需要配合Baichuan Modeling使用
+    # audio fields
+    audios: Optional["List|torch.Tensor"] = None
+    encoder_length: Optional["List|torch.Tensor"] = None
+    bridge_length: Optional["List|torch.Tensor"] = None
+    # image fields
+    images: Optional["List|torch.Tensor"] = None
+    patch_nums: Optional["List|torch.Tensor"] = None
+    images_size: Optional["List|torch.Tensor"] = None
+    crop_size: Optional["List|torch.Tensor"] = None
+    images_grid: Optional["List|torch.Tensor"] = None
+    # video fields
+    videos: Optional["List|torch.Tensor"] = None
+    videos_patch_nums: Optional["List|torch.Tensor"] = None
+    videos_size: Optional["List|torch.Tensor"] = None
+    videos_crop_size: Optional["List|torch.Tensor"] = None
+    videos_grid: Optional["List|torch.Tensor"] = None
+    # processor fields
+    raw_text: Optional[str] = None
+    index: Optional[int] = None
+    def concatenate(self, other):  # 仅限list使用
+        def concat_one(a, b):
+            if a is None and b is None:
+                return None
+            elif a is None and b is not None:
+                return b
+            elif a is not None and b is None:
+                return a
+            else:
+                return a + b
+        return BaichuanProcessorOutput(
+            input_ids=concat_one(self.input_ids, other.input_ids),
+            labels=concat_one(self.labels, other.labels),
+            audios=concat_one(self.audios, other.audios),
+            encoder_length=concat_one(self.encoder_length, other.encoder_length),
+            bridge_length=concat_one(self.bridge_length, other.bridge_length),
+            images=concat_one(self.images, other.images),
+            images_grid=concat_one(self.images_grid, other.images_grid),
+            patch_nums=concat_one(self.patch_nums, other.patch_nums),
+            videos=concat_one(self.videos, other.videos),
+            videos_grid=concat_one(self.videos_grid, other.videos_grid),
+            videos_patch_nums=concat_one(self.videos_patch_nums, other.videos_patch_nums),
+            position_ids=concat_one(self.position_ids, other.position_ids),
+            seqlens=concat_one(self.seqlens, other.seqlens),
+            images_size=concat_one(self.images_size, other.images_size)
+        )
+class BaichuanMMProcessor(object):
+    def __init__(self,
+                tokenizer: transformers.PreTrainedTokenizer,
+                config,
+                training,
+                relative_path=None,
+                **kwargs,
+    ):
+        self.tokenizer = tokenizer
+        self.config = config
+        self.audio_processor = None
+        if hasattr(config, "audio_config"):
+            self.audio_processor = BaichuanAudioProcessor(config.audio_config)
+        self.visual_processor = None
+        if hasattr(config, "visual_config"):
+            self.visual_processor = BaichuanImageProcessor(config.visual_config)
+        self.video_processor = None
+        if hasattr(config, "video_config"):
+            self.video_processor = BaichuanImageProcessor(config.video_config)
+        self.training = training
+        self.relative_path = relative_path
+        self.cos_client = CosClient()
+        self.tos_client = TosClient()
+        # audio tag
+        self.audio_start_tag = None
+        self.audio_end_tag = None
+        self.audio_pad_tag = None
+        self.audio_delim_tag = None
+        if hasattr(self.config, "audio_config"):
+            self.audio_start_tag = self.tokenizer.convert_ids_to_tokens(self.config.audio_config.audio_start_token_id)
+            self.audio_end_tag = self.tokenizer.convert_ids_to_tokens(self.config.audio_config.audio_end_token_id)
+            self.audio_pad_tag = self.tokenizer.convert_ids_to_tokens(self.config.audio_config.audio_pad_token_id)
+            self.audio_delim_tag = self.tokenizer.convert_ids_to_tokens(self.config.audio_config.audio_delim_token_id)
+        # image tag
+        self.image_start_tag = None
+        self.image_end_tag = None
+        self.image_pad_tag = None
+        self.video_start_tag = None
+        self.video_end_tag = None
+        if hasattr(self.config, "visual_config"):
+            # special token for start_tag
+            self.image_start_tag = self.tokenizer.convert_ids_to_tokens(self.config.visual_config.image_start_token_id)
+            # special token for end_tag
+            self.image_end_tag = self.tokenizer.convert_ids_to_tokens(self.config.visual_config.image_end_token_id)
+            # special token for pad_tag
+            self.image_pad_tag = self.tokenizer.convert_ids_to_tokens(self.config.visual_config.image_pad_token_id)
+            self.image_line_tag = self.tokenizer.convert_ids_to_tokens(self.config.visual_config.image_line_token_id)
+            self.image_delimiter_tag = self.tokenizer.convert_ids_to_tokens(self.config.visual_config.image_delimiter_token_id)
+        if hasattr(self.config, "video_config"):
+            self.video_start_tag = self.tokenizer.convert_ids_to_tokens(self.config.video_config.video_start_token_id)
+            self.video_end_tag = self.tokenizer.convert_ids_to_tokens(self.config.video_config.video_end_token_id)
+            self.image_start_tag = self.tokenizer.convert_ids_to_tokens(self.config.video_config.image_start_token_id)
+            self.image_end_tag = self.tokenizer.convert_ids_to_tokens(self.config.video_config.image_end_token_id)
+            self.image_pad_tag = self.tokenizer.convert_ids_to_tokens(self.config.video_config.image_pad_token_id)
+            self.video_place_tag = self.tokenizer.convert_ids_to_tokens(self.config.video_config.video_place_token_id)
+    # @lru_cache(maxsize=1024)
+    def _get_audio(self, audio_info, return_mm_data = True):
+        try:
+            audio_info = ujson.loads(audio_info)
+            audio_uri = None
+            if 'path' in audio_info.keys():
+                if self.relative_path is not None: # 优先匹配本地路径
+                    audio_uri = os.path.join(self.relative_path, audio_info['path'])
+                    if not os.path.exists(audio_uri):
+                        audio_uri = None
+                if audio_uri is None:  # 本地没有尝试取cos/tos
+                    if audio_info.get('server', 'cos') == 'tos':
+                        audio_uri = self.tos_client(audio_info['path'], 'audio-dataset')
+                    else:
+                        audio_uri = self.cos_client(audio_info['path'], 'audio-data-1317568651')
+            elif 'local' in audio_info.keys():
+                audio_uri = audio_info['local']
+                if not os.path.exists(audio_uri):
+                    audio_uri = None
+                    return BaichuanProcessorOutput()
+            else:
+                raise ValueError("can not find path or local in audio_info")
+            waveforms = self.audio_processor.load_audio_waveform(audio_uri, True)
+            waveforms = self.audio_processor.split_with_overlap(waveforms)  # 分割逻辑
+            ret = BaichuanProcessorOutput()  # 默认初始化 audios字段为None
+            for waveform in waveforms:
+                audio, input_length = self.audio_processor.extract_fbank_features(waveform)
+                audio = self.audio_processor.data_augment(audio, input_length, self.training)
+                encoder_length, bridge_length = self.audio_processor.inference_output_length(self.config.audio_config, input_length)
+                if bridge_length <= 0:  # 过滤极端短数据 1. 如果len(waveforms)==1 ret=None; 2. len(waveforms)>1 则说明最后一段太短被抛弃
+                    continue
+                current_ret = BaichuanProcessorOutput(
+                    audios=[audio],
+                    encoder_length=[encoder_length],
+                    bridge_length=[bridge_length])
+                if ret.audios is None:
+                    ret = current_ret
+                else:
+                    ret = ret.concatenate(current_ret)  # 拼接多个切片
+            if not return_mm_data:
+                ret.audios = [None]
+            return ret
+        except Exception as e:
+            print("**** get audio error: {}, info: {} *****".format(str(e), str(audio_info)))
+        return BaichuanProcessorOutput()
+    # @lru_cache(maxsize=1024)
+    def _get_image(self, image_info, return_mm_data = True):
+        try:
+            try: # chensong
+                image_info = ujson.loads(image_info)
+            except:
+                #image_info = image_info.replace("'", '"')
+                image_info = re.sub(r"(?<!\\)'", '"', image_info)
+                image_info = ujson.loads(image_info)
+            if 'base64' in image_info.keys():
+                image_data = base64.b64decode(image_info['base64'])
+                image_feat, org_size, image_list = self.visual_processor.image_transform(image_data)
+            elif 'local' in image_info.keys():
+                image_feat, org_size, image_list = self.visual_processor.image_transform(image_info['local'],return_mm_data = return_mm_data)
+            elif 'path' in image_info.keys():
+                if "tos_bucket" in image_info.keys(): # tos上的每个item，一定要写明tos的桶以及tos_bucket这个key
+                    tos_bucket = image_info['tos_bucket']
+                    image_bytes = self.tos_client(image_info['path'], tos_bucket) # 从cos_client 获得 image
+                else:
+                    cos_bucket = None
+                    if "cos_bucket" in image_info.keys():
+                        cos_bucket = image_info['cos_bucket']
+                    if "bucket_name" in image_info.keys():
+                        cos_bucket = image_info['bucket_name']
+                    image_bytes = self.cos_client(image_info['path'], cos_bucket) # 从cos_client 获得 image
+                # 获得image_feat(image patches), org_size(image最初的size), image_list
+                image_feat, org_size, image_list = self.visual_processor.image_transform(image_bytes)
+            else:
+                raise ValueError("can not find any path in image_info")
+            merge_length = self.visual_processor.merge_size**2
+            patch_nums = np.array(image_list).prod() // merge_length
+            if org_size[0] * org_size[1] > 16**2:  # 极端小的图过滤
+                return BaichuanProcessorOutput(
+                        images=[image_feat],
+                        patch_nums=[patch_nums],
+                        crop_size=[image_list],
+                        images_size= [org_size],
+                        images_grid=[image_list]
+                        )
+            else:
+                print("**** image too small: {}, info: {} *****".format(str(org_size), str(image_info)))
+                return BaichuanProcessorOutput()
+        except Exception as e:
+            print("**** get image error: {}, info: {} *****".format(str(e), str(image_info)))
+        return BaichuanProcessorOutput()
+    # @lru_cache(maxsize=1024)
+    def _get_video_frame(self, video_frame_info, return_mm_data = True):
+        try:
+            pattern = r'\{.*?\}'
+            matches = re.findall(pattern, video_frame_info)
+            ret = BaichuanProcessorOutput()
+            # 逐个解析
+            for match in matches:
+                video_frame_info = ujson.loads(match)
+                if 'local' in video_frame_info.keys():
+                    image_feat, org_size, image_list = self.video_processor.image_transform(video_frame_info['local'],return_mm_data = return_mm_data)
+                else:
+                    raise ValueError("can not find any path in image_info")
+                merge_length = self.video_processor.merge_size**2
+                patch_nums = np.array(image_list).prod() // merge_length
+                if org_size[0] * org_size[1] > 16**2:  # 极端小的图过滤
+                    ret = ret.concatenate(
+                            BaichuanProcessorOutput(
+                                videos=[image_feat],
+                                videos_patch_nums=[patch_nums],
+                                videos_crop_size=[image_list],
+                                videos_size= [org_size],
+                                videos_grid=[image_list]
+                            )
+                        )
+                else:
+                    print("**** video too small: {}, info: {} *****".format(str(org_size), str(video_frame_info)))
+            return ret
+        except Exception as e:
+            print("**** get video error: {}, info: {} *****".format(str(e), str(video_frame_info)))
+        return BaichuanProcessorOutput()
+    # Read the raw bytes of a video.
+    def _get_video_obj_byte(self, source, path, video_obj_json):
+        video_obj_byte = None
+        if source == "cos":
+            start_time = time.time()
+            video_obj_byte = self.cos_client(path, bucket_name=video_obj_json.get("cos_bucket", None))
+            if (time.time() - start_time) > 1.0:
+                self.reflash_cos_client()
+        if source == "local":
+            if os.path.exists(path):
+                video_obj_byte = open(path, "rb").read()
+            else:
+                video_obj_byte = None
+        if source == "base64":
+            video_obj_byte = base64.b64decode(path)
+        if source == "url":
+            video_obj_byte = requests.get(url=path).content
+        return video_obj_byte
+    # Split a video into frames and save them in a sub-directory.
+    def _split_video_to_frames(self, video_info, max_frame_number=-1, decode_way="1fps"):
+        video_path = video_info['local']
+        # 帧保存本地路径
+        frame_path = video_path.split('.')[0] + '_frames'
+        if not os.path.exists(frame_path) or len(os.listdir(frame_path))==0:
+            # 保存帧
+            os.makedirs(frame_path, exist_ok=True)
+            mm_obj_byte = self._get_video_obj_byte('local', video_path, video_info)
+            if mm_obj_byte is None: # 未读取到视频文件
+                return ""
+            frames = read_video(io.BytesIO(mm_obj_byte), max_frame_number=max_frame_number, decode_way=decode_way) #读取全部帧
+            for frame_idx, frame in enumerate(frames):
+                output_filename = os.path.join(frame_path, f"{frame_idx}.jpg")
+                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                cv2.imwrite(output_filename, frame)
+        # 选取帧
+        frame_number = len([filename for filename in os.listdir(frame_path) if filename.endswith('.jpg')])
+        if frame_number>max_frame_number:
+            indices = np.linspace(0, frame_number - 1, max_frame_number, dtype=int)
+        else:
+            indices = np.linspace(0, frame_number - 1, frame_number, dtype=int)
+        # 拼接模式
+        replace_str = ""
+        for idx in indices:
+            frame_str = f"{self.image_start_tag}{os.path.join(frame_path, f'{idx}.jpg')}{self.image_end_tag}"
+            replace_str += frame_str
+        return replace_str
+    def _get_video_frame_str(self, video_info, return_mm_data = True ):
+        try:
+            video_info = ujson.loads(video_info)
+            if 'local' in video_info.keys():
+                # 获取包含多帧图像路径的字符串，最大帧数量max_frame_number
+                frames_str = self._split_video_to_frames(video_info, max_frame_number=self.config.video_config.max_frame_num, decode_way=self.config.video_config.decode_way)
+                if frames_str != "":
+                    parts = frames_str.split(self.image_end_tag)
+                    result = []
+                    for part in parts:
+                        if self.image_start_tag in part:
+                            before_path, path = part.split(self.image_start_tag)
+                            new_path = f'{self.image_start_tag}{{"local": "{path}"}}{self.image_end_tag}'
+                            result.append(before_path + new_path)
+                        else:
+                            result.append(part)
+                    return ''.join(result)
+            else:
+                raise ValueError('can not find localpath in video_info')
+        except Exception as e:
+            print("**** get video error: {}, info: {} *****".format(str(e), str(video_info)))
+        return ""
+    # def _replace_audio(self, audio_text, return_mm_data = True):
+        # audio_info = re.sub(re.compile(self.audio_start_tag + "|" + self.audio_end_tag), '', audio_text)
+        # ret = self._get_audio(audio_info, return_mm_data)  # 重复取结果 cached result
+    def _replace_audio(self, audio_text, mminfo_ret_dict):
+        audio_info = re.sub(re.compile(self.audio_start_tag + "|" + self.audio_end_tag), '', audio_text)
+        # ret = self._get_audio(audio_info)  # 重复取结果 cached result
+        ret = mminfo_ret_dict.get(audio_info, BaichuanProcessorOutput())    # 直接从字典取
+        if ret.bridge_length is not None:  # TODO 如果pad token很多 tokenizer效率会很低
+            replaced_text = [self.audio_pad_tag * l for l in ret.bridge_length]
+            replaced_text = self.audio_delim_tag.join(replaced_text)
+            return self.audio_start_tag + replaced_text + self.audio_end_tag
+        return ''
+    # def _replace_image(self, image_text, return_mm_data = True):
+    #     image_info = re.sub(re.compile(self.image_start_tag + "|" + self.image_end_tag), '', image_text)
+    #     ret = self._get_image(image_info, return_mm_data)  # 重复取结果 cached result
+    def _replace_image(self, image_text, mminfo_ret_dict):
+        image_info = re.sub(re.compile(self.image_start_tag + "|" + self.image_end_tag), '', image_text)
+        # ret = self._get_image(image_info)  # 重复取结果 cached result
+        ret = mminfo_ret_dict.get(image_info, BaichuanProcessorOutput())    # 直接从字典取
+        if ret.patch_nums is None:
+            return ''
+        return self.image_start_tag + self.image_pad_tag * ret.patch_nums[0] + self.image_end_tag
+        return ''
+    # def _replace_video_frame(self, video_frame_text, return_mm_data = True):
+        # video_frame_info = re.sub(re.compile(self.image_start_tag + "|" + self.image_end_tag), '', video_frame_text)
+        # ret = self._get_video_frame(video_frame_info, return_mm_data)  # 重复取结果 cached result
+    def _replace_video_frame(self, video_frame_text, mminfo_ret_dict):
+        video_frame_info = re.sub(re.compile(self.video_start_tag + '|' + self.video_end_tag), '', video_frame_text)
+        video_frame_info = re.sub(re.compile(self.image_start_tag + "|" + self.image_end_tag), '', video_frame_info)
+        # ret = self._get_video_frame(video_frame_info)  # 重复取结果 cached result
+        ret = mminfo_ret_dict.get(video_frame_info, BaichuanProcessorOutput())
+        if ret.videos_patch_nums is None:
+            return ''
+        video_frame_str = [self.image_start_tag + self.video_place_tag * ret.videos_patch_nums[i] + self.image_end_tag for i in range(len(ret.videos_patch_nums))]
+        return ''.join(video_frame_str)
+    def extract_replace_multimodal(self, text, mtype='audio', return_mm_data = True):
+        # 抽取text中的json格式音频/图像信息，读取并转化为特征，同时估计encoder token数，填入对应数量的pad token
+        if (self.audio_start_tag != None) and (mtype == 'audio'):
+            match_regex = re.compile(self.audio_start_tag + '.*?' + self.audio_end_tag)
+            drop_regex = re.compile(self.audio_start_tag + "|" + self.audio_end_tag)
+            extract_func = self._get_audio
+            replace_func = self._replace_audio
+        elif (self.image_start_tag != None) and (mtype == 'image'):
+            match_regex = re.compile(self.image_start_tag + '.*?' + self.image_end_tag)
+            drop_regex = re.compile(self.image_start_tag + "|" + self.image_end_tag)
+            extract_func = self._get_image
+            replace_func = self._replace_image
+        elif (self.video_start_tag != None) and (mtype == 'video'):
+            video_match_regex = re.compile(self.video_start_tag + '.*?' + self.video_end_tag)
+            video_drop_regex = re.compile(self.video_start_tag + "|" + self.video_end_tag)
+            # 处理视频，将视频路径转换为多帧图像路径
+            mm_info_list = re.findall(video_match_regex, text)
+            for mm_info in mm_info_list:
+                frame_str = self._get_video_frame_str(re.sub(video_drop_regex, '', mm_info))
+                # 替换路径；如果视频不存在，路径替换为空字符串
+                text = re.sub(mm_info, self.video_start_tag + frame_str + self.video_end_tag, text)
+            # 采用多图像处理方式
+            match_regex = re.compile(self.video_start_tag+r'(.*?)'+self.video_end_tag)
+            drop_regex = re.compile(self.image_start_tag + "|" + self.image_end_tag)
+            extract_func = self._get_video_frame
+            replace_func = self._replace_video_frame
+        else:
+            raise ValueError("mtype not supportted!")
+        mm_info_list = re.findall(match_regex, text)
+        mm_info_list = [re.sub(drop_regex, '', mm_info) for mm_info in mm_info_list]
+        mminfo_ret_dict = {}
+        ret = BaichuanProcessorOutput()
+        for mm_info in mm_info_list:  # 如果没有匹配到对应的模态 直接返回raw_text=text 结果不会是None
+            mm_ret = extract_func(mm_info, return_mm_data = return_mm_data)
+            mminfo_ret_dict[mm_info] = mm_ret
+            if mm_ret.audios is None and mm_ret.images is None and mm_ret.videos is None:  # 数据包含音频/图像/视频但抽取失败 整条数据无效（ret的raw_text为None
+                return ret
+            ret = ret.concatenate(mm_ret)  # 可能有多条结果，初步collect
+        # ret.raw_text = re.sub(match_regex, lambda x: replace_func(x.group()), text)
+        ret.raw_text = re.sub(match_regex, lambda x: replace_func(x.group(), mminfo_ret_dict), text)
+        return ret
+    def process_one(self, text, index=0, raw_only=False, return_mm_data = True):
+        ret = BaichuanProcessorOutput(index=index)
+        for mtype in self.config.multimodal:  # 循环获取音频 图像结果 并更新raw_text字段
+            mret = self.extract_replace_multimodal(text, mtype, return_mm_data = return_mm_data) # 增加获取视频结果
+            if mret.raw_text is None:  # 数据包含音频但音频获取失败
+                return BaichuanProcessorOutput(index=index)
+            ret = ret.concatenate(mret)
+            text = mret.raw_text
+            ret.raw_text = text
+        if raw_only:
+            return ret  # 兼容SFT等自定义tokenizer逻辑的代码
+        # 处理预训练中的trainable部分
+        input_ids, labels = [], []
+        trainable_sep = re.findall(r'<trainable_start>|<trainable_end>', ret.raw_text.replace('\n', '<LF>'))
+        if len(trainable_sep) <= 0:
+            input_ids = self.tokenizer(ret.raw_text, padding='do_not_pad', truncation=True, return_tensors="np")['input_ids'][0].tolist()
+            labels = [True for _ in input_ids]
+        else:
+            split_content = re.split(r'<trainable_start>|<trainable_end>', ret.raw_text)
+            for i, sc in enumerate(split_content):
+                if len(sc.strip()) == 0:
+                    continue  # 把多余的空格干掉
+                sc_ids = self.tokenizer(sc, padding='do_not_pad', truncation=True, return_tensors="np")['input_ids'][0].tolist()
+                input_ids.extend(sc_ids)
+                if i == 0 or trainable_sep[i - 1] == '<trainable_end>':  # stop gradient
+                    labels.extend([False] * len(sc_ids))
+                else:
+                    labels.extend([True] * len(sc_ids))
+        # input_ids += [self.tokenizer.eos_token_id]
+        # labels += [True]
+        ret.labels = [input_ids[j] if (l and input_ids[j] not in self.config.multimodal_special_token_no_loss_list) else -100 for j, l in enumerate(labels)]
+        ret.input_ids = input_ids
+        ret.index = index
+        return ret
+    @torch.no_grad()
+    def __call__(self, example, parallel=8):
+        # 最终入口 支持预训练数据string，sft数据message， 以及 batch推理数据listofstring 3种形式
+        if isinstance(example, Dict):
+            pass
+        elif isinstance(example, str):
+            return self.process_one(example)
+        elif isinstance(example, List):  # batch推理 异步多线程处理
+            with cf.ThreadPoolExecutor(min(parallel, len(example))) as executor:
+                future_list = [executor.submit(self.process_one, di, idx) for idx, di in enumerate(example)]
+                batch_data = [key.result() for key in cf.as_completed(future_list)]
+            valid_num = sum([1 if x.input_ids is not None else 0 for x in batch_data])
+            assert(valid_num == len(batch_data))  # 推理数据严格要求数量对齐
+            batch_data = sorted(batch_data, key=lambda x: x.index)  # 保证顺序不变
+            ret = BaichuanProcessorOutput()
+            for i in range(len(batch_data)):
+                ret = ret.concatenate(batch_data[i])
+            self.tokenizer.padding_side = "left"
+            padding_result = self.tokenizer.pad({"input_ids": [r.input_ids for r in batch_data]}, return_tensors='pt')
+            ret.input_ids = padding_result["input_ids"]
+            ret.attention_mask = padding_result["attention_mask"]  # batch推理不pack 不需要seqlens
+            padding_result = self.tokenizer.pad({"input_ids": [r.labels for r in batch_data]}, return_tensors='pt')
+            ret.labels = padding_result["input_ids"]
+            if ret.audios is not None:
+                ret.audios = default_collate(ret.audios)
+                ret.encoder_length = default_collate(ret.encoder_length)
+                ret.bridge_length = default_collate(ret.bridge_length)
+            if ret.images is not None:
+                ret.images = [torch.from_numpy(np.asarray(image, dtype=np.float32)) for image in ret.images]
+                # else:ret.images = default_collate(ret.images)
+                # ret.patch_nums = default_collate(ret.patch_nums)
+            if ret.videos is not None:
+                ret.images = [torch.from_numpy(np.asarray(image, dtype=np.float32)) for image in ret.videos]
+            return ret
+        else:
+            raise ValueError("example format supported yet")
+    @torch.no_grad()
+    def pack_batch_pretrain(self, raw_batch, max_sequence_length=None, parallel=8):
+        """Process raw pre-training records and pack them into fixed-length sequences.
+
+        Args:
+            raw_batch: list of strings, each a JSON (or Python-literal) record.
+                A list record uses its element 1 as content; a dict record
+                uses ``"raw"`` or ``"content"``, prefixed by ``"title"`` if
+                present.
+            max_sequence_length: packed length; defaults to
+                ``self.tokenizer.model_max_length``.
+            parallel: number of worker threads; values <= 1 select a
+                sequential debug path.
+
+        Returns:
+            A list of ``BaichuanProcessorOutput``; each one holds several
+            samples (each followed by an EOS token) right-padded to
+            ``max_sequence_length``, together with their multimodal inputs.
+        """
+        if max_sequence_length is None:
+            max_sequence_length = self.tokenizer.model_max_length
+        # Pack N samples into M sequences of max_sequence_length; each packed sequence keeps the multimodal inputs of its samples
+        assert isinstance(raw_batch, List)
+        start_ts = time.time()
+        if parallel > 1:
+            with cf.ThreadPoolExecutor(max_workers=parallel) as executor:
+                future_list = []
+                for idx, json_text in enumerate(raw_batch):
+                    try:  # parse the record as json
+                        json_obj = ujson.loads(json_text.strip())
+                    except:
+                        # fall back to a Python literal; records that fail both parsers are skipped
+                        try:
+                            json_obj = ast.literal_eval(json_text.strip())
+                        except:
+                            print("parse json obj faild: {}....".format(json_text[:300]))
+                            continue
+                    try: # chensong
+                        if isinstance(json_obj, list):
+                            content = json_obj[1]
+                        elif 'raw' in json_obj.keys():
+                            content = (json_obj["title"] if "title" in json_obj.keys() else "") + json_obj["raw"]
+                        else:
+                            content = (json_obj["title"] if "title" in json_obj.keys() else "") + json_obj["content"]
+                    except:
+                        print("parse json raw/content error: {}....".format(json_text[:300]))
+                        continue
+                    future_list.append(executor.submit(self.process_one, content, idx))
+                # collect the results in completion order (not input order)
+                batch_data = [key.result() for key in cf.as_completed(future_list)]
+        else: # debug only
+            # sequential path: no parse fallback, no title prefix, and every sample gets index 0
+            batch_data = []
+            for json_text in raw_batch:
+                data = ujson.loads(json_text.strip())
+                if 'raw' in data.keys():
+                    batch_data.append(self.process_one(data['raw'], 0))
+                else:
+                    batch_data.append(self.process_one(data['content'], 0))
+        # warn when the average processing time per sample exceeds one second
+        if (time.time() - start_ts) / (len(batch_data) + 1e-3) > 1.0:
+            print('[WARNING] processing each data cost more than 1.0s')
+        # pack the text part of the inputs; samples are never truncated
+        current_length, packed_output, output = 0, BaichuanProcessorOutput(position_ids=[], seqlens=[]), []
+        empty_data = BaichuanProcessorOutput(input_ids=[], labels=[])
+        for idx, bd in enumerate(batch_data + [empty_data]):  # the trailing empty sample forces the last pack to be flushed into output
+            if bd.input_ids is None and idx < len(batch_data):
+                continue  # the sample failed to load and is not the trailing sentinel
+            if (len(bd.input_ids) <= 0 or len(bd.input_ids) + 1 > max_sequence_length) and idx < len(batch_data):
+                continue  # empty or over-long samples are dropped (unless this is the trailing sentinel)
+            if current_length + len(bd.input_ids) + 1 > max_sequence_length or idx == len(batch_data):
+                # the next sample does not fit (or this is the sentinel): pad and emit the current pack
+                pad_nums = max_sequence_length - current_length  # right padding
+                if packed_output.input_ids is None or packed_output.labels is None:
+                    # nothing was packed: emit a sequence made of padding only
+                    packed_output.input_ids = [self.tokenizer.pad_token_id] * pad_nums
+                    packed_output.labels = [-100] * pad_nums
+                    # NOTE(review): the extra "+1" presumably mirrors the extra position described in the note below - TODO confirm
+                    packed_output.position_ids += [0] * (pad_nums+1)
+                else:
+                    packed_output.input_ids += [self.tokenizer.pad_token_id] * pad_nums
+                    packed_output.labels += [-100] * pad_nums
+                    packed_output.position_ids += [0] * pad_nums
+                packed_output.attention_mask = [1] * current_length + [0] * pad_nums
+                # seqlens is zero-padded so that it has max_sequence_length entries
+                packed_output.seqlens += [0] * (max_sequence_length - len(packed_output.seqlens))
+                output.append(packed_output)
+                packed_output = BaichuanProcessorOutput(position_ids=[], seqlens=[])  # start a new, empty pack
+            # NOTE(review): concatenate() returns bd.input_ids / bd.labels themselves (no copy) when the
+            # pack is still empty, i.e. for the first sample of every pack. The appends below then also
+            # grow bd's lists, so the two `len(bd.input_ids) + 1` expressions appear to count one extra
+            # position for that sample - confirm whether downstream code relies on this.
+            packed_output = packed_output.concatenate(bd)
+            packed_output.input_ids.append(self.tokenizer.eos_token_id)  # the EOS token has to be added separately
+            packed_output.labels.append(self.tokenizer.eos_token_id)
+            packed_output.position_ids.extend(list(range(len(bd.input_ids) + 1)))
+            packed_output.seqlens.append(len(bd.input_ids) + 1)
+            current_length = len(packed_output.input_ids)
+        return output
+    @torch.no_grad()
+    def collect_batch_pretrain(self, batch_data):
+        ret = BaichuanProcessorOutput()
+        for i in range(len(batch_data)):
+            ret = ret.concatenate(batch_data[i])
+        ret.input_ids = default_collate([np.asarray(x.input_ids, dtype=np.int64) for x in batch_data]).cuda(non_blocking=True)
+        ret.labels = default_collate([np.asarray(x.labels, dtype=np.int64) for x in batch_data]).cuda(non_blocking=True)
+        ret.attention_mask = default_collate([np.asarray(x.attention_mask, dtype=np.float32) for x in batch_data]).cuda(non_blocking=True)
+        ret.position_ids = default_collate([np.asarray(x.position_ids, dtype=np.int64) for x in batch_data]).cuda(non_blocking=True)
+        ret.seqlens = default_collate([np.asarray(x.seqlens, dtype=np.int64) for x in batch_data]).cuda(non_blocking=True)
+        ret.raw_text = None
+        if ret.audios is not None:
+            ret.audios = default_collate(np.asarray(ret.audios, dtype=np.float32)).cuda(non_blocking=True)
+            ret.encoder_length = default_collate(np.asarray(ret.encoder_length, dtype=np.int32)).cuda(non_blocking=True)
+            ret.bridge_length = default_collate(np.asarray(ret.bridge_length, dtype=np.int32)).cuda(non_blocking=True)
+        if ret.images is not None:
+            ret.images = [torch.from_numpy(np.asarray(image, dtype=np.float32)).cuda(non_blocking=True)  for image in ret.images]#default_collate(np.asarray(ret.images, dtype=np.float32)).cuda(non_blocking=True)
+            ret.patch_nums = default_collate(np.asarray(ret.patch_nums, dtype=np.int32)).cuda(non_blocking=True)
+        if ret.videos is not None:
+            ret.videos = [torch.from_numpy(np.asarray(video, dtype=np.float32)).cuda(non_blocking=True)  for video in ret.videos]#default_collate(np.asarray(ret.images, dtype=np.float32)).cuda(non_blocking=True)
+            ret.videos_patch_nums = default_collate(np.asarray(ret.videos_patch_nums, dtype=np.int32)).cuda(non_blocking=True)
+        return ret
+    @torch.no_grad()
+    def collect_batch_sft(self, batch_data):
+        # list of dict to dataclass
+        batch_data = [BaichuanProcessorOutput(**bd) for bd in batch_data]
+        ret = BaichuanProcessorOutput()
+        for i in range(len(batch_data)):
+            ret = ret.concatenate(batch_data[i])
+        ret.input_ids = default_collate([np.asarray(x.input_ids, dtype=np.int64) for x in batch_data])
+        ret.labels = default_collate([np.asarray(x.labels, dtype=np.int64) for x in batch_data])
+        ret.position_ids = default_collate([np.asarray(x.position_ids, dtype=np.int64) for x in batch_data])
+        ret.seqlens = default_collate([np.asarray(x.seqlens, dtype=np.int64) for x in batch_data])
+        ret.raw_text = None
+        if ret.audios is not None:
+            ret.audios = default_collate(np.asarray(ret.audios, dtype=np.float32))
+            ret.encoder_length = default_collate(np.asarray(ret.encoder_length, dtype=np.int32))
+            ret.bridge_length = default_collate(np.asarray(ret.bridge_length, dtype=np.int32))
+        if ret.images is not None:
+            # 转换 每个image 为torch tensor
+            ret.images = [torch.from_numpy(np.asarray(image, dtype=np.float32))  for image in ret.images]#default_collate(np.asarray(ret.images, dtype=np.float32)).cuda(non_blocking=True)
+        if ret.videos is not None:
+            ret.videos = [torch.from_numpy(np.asarray(video, dtype=np.float32))  for video in ret.videos]#default_collate(np.asarray(ret.images, dtype=np.float32)).cuda(non_blocking=True)
+            # ret.patch_nums = default_collate(np.asarray(ret.patch_nums, dtype=np.int32)).cuda(non_blocking=True)
+        ret = ret.__dict__
+        del ret['patch_nums']
+        del ret['images_size']
+        del ret['crop_size']
+        del ret['raw_text']
+        del ret['index']
+        del ret['attention_mask']
+        del ret['videos_patch_nums']
+        del ret['videos_size']
+        del ret['videos_crop_size']
+        return ret
+#######################################################
+## Unit test functions. Usage:
+## python processor_baichuan.py <test_function_name>   (e.g. test_processor)
+#######################################################
+def test_img_processor():
+    from transformers import AutoConfig
+    from transformers.models.clip import CLIPImageProcessor
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    processor = BaichuanImageProcessor(config.visual_config)
+    offical_processor = CLIPImageProcessor(size=config.visual_config.crop_size, crop_size=config.visual_config.crop_size,
+                        image_mean=config.visual_config.image_mean, image_std=config.visual_config.image_std,
+                        do_convert_rgb=True)
+    img_files = ['sogou/7a2c8ffc1bc61146b32805c3390f42e2', 'wukong/77c1db1c0e4200d12b478c33ba3a412d', 'wukong/62e9a5c8eb8b0ea8858a34ba3f1a999f', 'wukong/fb9ab4d7c3fe9f54289948fd6a57fc30']
+    cos_client = CosClient()
+    for img_file in img_files:
+        img_bytes = cos_client(img_file)
+        img_rbg = Image.open(io.BytesIO(img_bytes))
+        image, org_size = processor.image_transform(img_bytes)
+        offical_image = offical_processor.preprocess([img_rbg],
+                        do_resize=True, do_center_crop=True, do_rescale=True, do_normalize=True,
+                        return_tensors='np').data['pixel_values'][0]
+        print('-'*60)
+        print(np.array(img_rbg).shape)
+        print(image.shape)
+        print(offical_image.shape)
+        print(image - offical_image)
+def test_audio_processor():
+    from transformers.models.whisper import WhisperFeatureExtractor
+    from transformers import AutoConfig
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    offical_processor = WhisperFeatureExtractor(feature_size=128)
+    processor = BaichuanAudioProcessor(config.audio_config)
+    # wave_files = glob.glob('/home/nfs_bc_alignment/sunhaoze/audio-data/openaqa/openaqa-as/audio/*')
+    wave_files = ['/home/nfs_bc_alignment/sunhaoze/sounds/audioset_full/7ZY0U5tfKyQ.flac', '/home/nfs_bc_alignment/sunhaoze/sounds/audioset_full/Osly4Shchs4.flac']
+    for wave_file in wave_files:
+        wave = processor.load_audio_waveform(wave_file, True, False)
+        offical_features = offical_processor(wave[0].numpy(), do_normalize=False)
+        feat = offical_features['input_features'][0]
+        wave, frame_nums = processor.extract_fbank_features(wave)
+        print("="*60)
+        print(feat.shape)
+        print(wave.shape, frame_nums)
+        print('the difference between offical extractor and our implementation: {}'.format(wave_file))
+        print(wave[:, :frame_nums] - feat[:, :frame_nums])
+        print(wave)
+        # print(wave[120:-1, :])
+        # print(feat[120:-1, :wave.shape[1]])
+        zeros_before = np.sum(wave == 0)
+        aug = processor.data_augment(wave, frame_nums)
+        zeros_after = np.sum(aug == 0)
+        print(zeros_before, zeros_after)
+def test_audio_long():  # 测试超过30秒音频的截断策略
+    from transformers import AutoConfig, AutoTokenizer
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    config.audio_config.split_overlap = 1
+    tokenizer = AutoTokenizer.from_pretrained("./", model_max_length=4096)
+    processor = BaichuanMMProcessor(tokenizer, config, True)
+    examples = ["<audio_start_baichuan>{\"path\": \"panda\/testdata\/podcast_demo_30s\/easy_chat_xianliaohuier_30s\/easy_chat_xianliaohuier-133.mp3\"}<audio_end_baichuan>What is the level of noise from the speech?\n<trainable_start>The speech energy\n is medium.<trainable_end>",
+             "what's the sound's energy? \n sound1 <audio_start_baichuan>{\"path\": \"panda\/testdata\/podcast_demo_30s\/btrt_talk_heihua_30s\/btrt_talk_heihua-116.mp3\"}<audio_end_baichuan> \n sound2 <audio_start_baichuan>{\"path\": \"panda\/testdata\/podcast_demo_30s\/btrt_talk_heihua_30s\/btrt_talk_heihua-221.mp3\"}<audio_end_baichuan>The speech energy is medium.",
+            ]
+    ret = processor(examples)
+    print(ret)
+    print(torch.sum(ret.input_ids == 151659))
+    print(torch.sum(ret.input_ids == 151674))
+def test_processor():
+    from transformers import AutoConfig, AutoTokenizer
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained("./", model_max_length=4096)
+    processor = BaichuanMMProcessor(tokenizer, config, True, '/home/nfs_bc_alignment/sunhaoze/sounds')
+    examples = ["<audio_start_baichuan>{\"path\": \"vggsound\/7DH5fqj8j6Q.flac\"}<audio_end_baichuan>What is the level of noise from the speech?\n<trainable_start>The speech energy\n is medium.<trainable_end>",
+             "hello, baichuan 你好 百川智能。",
+             "what's the sound's energy? \n <audio_start_baichuan>{\"path\": \"iemocap\/Ses01F_script01_3_F022.wav\"}<audio_end_baichuan>The speech energy is medium.",
+             "sound1: <audio_start_baichuan>{\"path\": \"audioset_full\/9B53NVDNT8U.flac\"}<audio_end_baichuan>\n sound2: \n<audio_start_baichuan>{\"path\": \"audioset_full\/a2dgzb9GDSQ.flac\"}<audio_end_baichuan>How is the speech speed related to the estimated speaker age?\n<trainable_start>The slow speech speed suggests a more deliberate and thoughtful approach often seen in mature individuals.<trainable_end>",
+             "<img_start_baichuan>{\"path\": \"sogou\/7351ae4f3fbe58ff0e4cc165cfabb3ed\"}<img_end_baichuan>新和记潮汕牛肉火锅的牛肉丸好不好吃 用户评价口味怎么样 常州美食牛肉丸实拍图片 大众点评",
+             "这两个图片有什么关系?图片1<img_start_baichuan>{\"path\": \"sogou\/ac91d57ab68335913ed41aa283e76356\"}<img_end_baichuan>图片2\n<img_start_baichuan>{\"path\": \"sogou\/6ad5e632b74265d9ef689e45936ab1aa\"}<img_end_baichuan>",
+             "根据图片和语音给出描述\n图片<img_start_baichuan>{\"path\": \"sogou\/32274c1ab28d11f8c490cf7ae15b36f1\"}<img_end_baichuan>语音<audio_start_baichuan>{\"path\": \"voxceleb2\/id06726_s2lysJWkjus_00169.m4a\"}<audio_end_baichuan><trainable_start>这是一只猫<trainable_end>",
+             "这些图片和音频不存在<img_start_baichuan>{\"path\": \"soogou\/32274c1ab28d11f8c490cf7ae15b36f1\"}<img_end_baichuan>语音<audio_start_baichuan>{\"path\": \"voxceleb_1\/id06726_s2lysJWkjus_00169.m4a\"}<audio_end_baichuan><trainable_start>这是一只猫<trainable_end>"
+            ]
+    ret = processor(examples[4:-1])
+    print(ret)
+    print(torch.sum(ret.input_ids == 151659))
+    print(torch.sum(ret.input_ids == 151662))
+    try:
+        print(ret.bridge_length)
+        print(ret.patch_nums)
+    except:
+        pass
+    print(torch.sum(ret.attention_mask, dim=1))
+def test_grounding():
+    from transformers import AutoConfig, AutoTokenizer
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained("./", model_max_length=4096)
+    processor = BaichuanMMProcessor(tokenizer, config, True, '/home/nfs_bc_alignment/sunhaoze/sounds')
+    examples = ["<img_start_baichuan>{\"path\": \"grit\/663423bf2f0884c034bf75279bce9694\"}<img_end_baichuan>\nWhere is \"A woman\" ? Answer: <trainable_start>The bounding box is <box_start_baichuan>(0.58,0.8),(0.71,1.0)<box_end_baichuan><trainable_end>",
+             "hello, baichuan 你好 百川智能。",
+             "<img_start_baichuan>{\"path\": \"grit\/0e6e3952c584cbac7235940a22514656\"}<img_end_baichuan> Generate the caption with grounding: <trainable_start>Photo pour Portrait of <ref_start_baichuan>young Asian muslim woman wearing hijab<ref_end_baichuan><box_start_baichuan>(0.09,0.01),(0.77,1.0)<box_end_baichuan> shows regret gesture, hand on her forehead, forget something important, against red background - image libre de droit<trainable_end>",
+             "Recognize the object in the outlined section <img_start_baichuan>{\"path\": \"grit\/045823cf6f819670f27aee20af7ae0e6\"}<img_end_baichuan> of the picture.<box_start_baichuan>(0.07,0.2),(0.91,0.96)<box_end_baichuan>\n<trainable_start>Inflatable water trampolines<trainable_end>"
+            ]
+    ret = processor(examples)
+    print(ret)
+    for i, input_ids in enumerate(ret.input_ids):
+        print("="*60)
+        print(ret.labels[i])
+def test_pack():
+    from transformers import AutoConfig, AutoTokenizer
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained("./", model_max_length=2048)
+    processor = BaichuanMMProcessor(tokenizer, config, True, '/home/nfs_bc_alignment/sunhaoze/sounds')
+    examples = open('/cpfs/29f69eb5e2e60f26/user/sunhaoze/pretrain-v6/sogou/part-00000').readlines()[:5]
+    examples += open('/home/nfs_bc_alignment/sunhaoze/text/openaqa-as-stage2-v1/part-00000').readlines()[:5]
+    random.shuffle(examples)
+    batch_output = processor.pack_batch_pretrain(examples)
+    for i, b in enumerate(batch_output):
+        print('='*60)
+        try:
+            print(b.input_ids, len(b.input_ids))
+            print(b.labels, len(b.labels))
+            print(b.attention_mask, len(b.attention_mask))
+            print(b.position_ids, len(b.position_ids))
+            print(b.seqlens, len(b.seqlens))
+            print(b.audios)
+            print(b.bridge_length)
+        except:
+            continue
+    batch_for_model = processor.collect_batch_pretrain(batch_output)
+    print(batch_for_model.input_ids.shape)
+    print(batch_for_model.labels.shape)
+    print(batch_for_model.audios.shape)
+    print(batch_for_model["bridge_length"])
+    print(batch_for_model.images.shape)
+    print(batch_for_model["patch_nums"])
+    print(batch_for_model["position_ids"])
+    print(batch_for_model["seqlens"])
+def test_cos_audio():
+    cos_client = CosClient()
+    audio_bytes = cos_client('panda/data/common_voice/cv-corpus-18.0-2024-06-14/zh-CN/clips/common_voice_zh-CN_19428637.mp3', 'audio-data-1317568651')
+    wave, sr = torchaudio.load(audio_bytes, normalize=False)
+    print(wave.shape, sr)
+    # torchaudio.save('tmp.flac', wave, sr)
+if __name__ == '__main__':
+    # Script entry point: hand the command line to python-fire. Called without a component,
+    # fire is expected to expose this module's callables (e.g. the test_* functions) as sub-commands.
+    fire.Fire()

pytorch_model-00001-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fb5965836360babacb5a264aa538bc57a37fed34310f0556465399894ace7646
+size 4996313145

pytorch_model-00002-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c68f9d8fff8335effaeaffb17a57e56d62326e9cc1c8c2185e98c95dc530a4a
+size 2511631122

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,833 @@

+{
+  "metadata": {
+    "total_size": 7507663872
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00001-of-00002.bin",
+    "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.19.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.norm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.visual_bridge_model.ln_q.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_bridge_model.ln_q.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_bridge_model.mlp.0.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_bridge_model.mlp.0.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_bridge_model.mlp.2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_bridge_model.mlp.2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.0.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.1.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.10.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.11.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.12.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.13.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.14.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.15.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.16.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.17.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.18.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.19.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.2.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.20.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.21.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.22.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.23.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.24.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.25.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.26.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.27.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.28.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.29.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.3.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.30.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.31.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.4.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.5.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.6.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.7.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.8.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.norm1.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.norm1.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.norm2.bias": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.blocks.9.norm2.weight": "pytorch_model-00001-of-00002.bin",
+    "model.visual_model.patch_embed.proj.weight": "pytorch_model-00001-of-00002.bin"
+  }
+}

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff
+size 15984

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9
+size 15984

rng_state_10.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54e5c2d65c14df39137f6a3bd8314f534dad93d72efe8e40e23a61187ba74ce8
+size 15997

rng_state_11.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f578146bcaaed333cf4637aa3b454e2f6238e691e45b4160001b63b4f8b5ce4e
+size 15997

rng_state_12.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8463662162a837871bb50af43a407175ca95f3f364bc13c7d065e8b6929bad11
+size 15997

rng_state_13.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe9bafd75244b907f13aa5b102963f4d018d7b80b94ddb0262313774362d9305
+size 15997

rng_state_14.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5e353c1ca54c464e3464d695507da7da6c134e4ad9d6acab6f72e52d5bf13c7
+size 15997

rng_state_15.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24f1ef20cfc904ac8f811ceffab91f00cccadca0e878c8ef8a7335e83ef9513f
+size 15997

rng_state_16.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cecf995eb1b13a85a673384ed67c75acc989e9d373ac7da2d90cbc7af2c1b4a7
+size 15997

rng_state_17.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:375986c2a348f319aeed3322fcf23be7787f801c585046ff217d7855463fbc0d
+size 15997

rng_state_18.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a50dfdcb902d78e55850cabc410552a4e651090792c2e086926cbda383b4f35f
+size 15997

rng_state_19.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79850d41cfed8d83ef96222b2b956a0ec273b7fa78630d3cb0e86f2ab29bb934
+size 15997

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8
+size 15984

rng_state_20.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60201c3050f5fddd8ba67c9cc8ea589b8296872acef041a46972a75489ba48f2
+size 15997

rng_state_21.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b45ccf4c1d4fa82ab73d5b1084c8895bb7a843bb65c39bb0b2bb2a6192e6df3
+size 15997

rng_state_22.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54521e12e4ad123a5cdba9d3c092e63ee37b099ff802bbe417dfb47861244ab9
+size 15997

rng_state_23.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4917fcd537e822a9012f654e3131d5dbd358e1916dec368f464d89c1223a2e5
+size 15997

rng_state_24.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6b078914d8fc5b3568727c1679a97889d10f35460aaa0147c1ab343e0a8576f
+size 15997

rng_state_25.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58235624c008372a4489d1dc4d4a5d582e83b5954529d532a8ac9e27cc1117eb
+size 15997

rng_state_26.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d0a9e394ce0c35df215df576763794e20589536fd7d2e8aae0d2bfaac14eb95
+size 15997

rng_state_27.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a40fc515e550756e1b3497c9e62fa179918e76adb87ab3233db714b992268c8
+size 15997

rng_state_28.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dadf1f4ae507c5e5ab940441273a3552a7e59fcc516e7dfddd1956d1c5c5a3d
+size 15997

rng_state_29.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73dfc967a343045291988f3bcbbdee93e0877bb869db48fd675e9946c2c43507
+size 15997

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed
+size 15984

rng_state_30.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6890d56ed291634fd39a3901c77e10c249a355684a11b3e34af07a400ff9189c
+size 15997

rng_state_31.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25d52ac86e2e15b3ec76429e008562b498d6431769f7f3ad24c182d032f7701b
+size 15997

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e
+size 15984

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3
+size 15984

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c
+size 15984

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190
+size 15984

rng_state_8.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50d06b29f11da5c2e4a388e6620296a543a68f4d46bb2bb978188a8e374f8925
+size 15984

rng_state_9.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bde660db671106d0d18882a4db0faf4fe72837df15f5537708435e1d684a549e
+size 15984

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0932b63b31d2497f06e828818648fb5bc42ad70519e567bde3e0767ef1aa3288
+size 1128

sequence_parallel_utils.py ADDED Viewed

	@@ -0,0 +1,186 @@

+from typing import Any, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from flash_attn import flash_attn_varlen_func
+try:
+    import deepspeed.comm as dist
+except:
+    dist = None
+try:
+    from utils import (
+        get_sequence_parallel_group,
+        get_sequence_parallel_size,
+        get_sequence_parallel_rank
+    )
+except (ModuleNotFoundError, ImportError):
+    # Sequence-parallel settings come from utils; if the import fails, fall back to sequence parallelism disabled (no group, size 1, rank 0).
+    get_sequence_parallel_group = lambda : None
+    get_sequence_parallel_size = lambda : 1
+    get_sequence_parallel_rank = lambda : 0
+def single_all_to_all(input, scatter_idx, gather_idx, group):
+    """Redistribute ``input`` across the ranks of ``group`` with one all-to-all.
+    In the returned tensor the size of dimension ``scatter_idx`` is divided by
+    the group's world size and the size of dimension ``gather_idx`` is
+    multiplied by it; all other dimensions keep their size.
+    Arguments:
+        input (Tensor): local tensor to exchange
+        scatter_idx (int): dimension whose size is split across the group
+        gather_idx (int): dimension whose size grows by the world size
+        group: process group passed to ``dist.get_world_size`` / ``dist.all_to_all_single``
+    Returns:
+        Tensor: contiguous tensor with the redistributed shape
+    NOTE(review): ``dist`` is the module-level ``deepspeed.comm`` handle, which the
+    import fallback sets to ``None``; in that case this function fails on the
+    first ``dist`` call -- confirm callers only reach it when deepspeed is installed.
+    """
+    seq_world_size = dist.get_world_size(group)
+    # Target per-rank shape: the scatter dimension shrinks by the world size.
+    inp_shape = list(input.shape)
+    inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size
+    if scatter_idx < 2:
+        # Put one chunk per destination rank on a new leading dimension.
+        # NOTE(review): dimensions before scatter_idx are not part of this shape,
+        # so this branch looks valid only for scatter_idx == 0 -- confirm.
+        input_t = input.reshape(
+            [seq_world_size, inp_shape[scatter_idx]] + \
+            inp_shape[scatter_idx + 1:]
+        ).contiguous()
+    else:
+        # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
+        input_t = input.reshape(
+            [-1, seq_world_size, inp_shape[scatter_idx]] + \
+            inp_shape[scatter_idx + 1:]
+        ).transpose(0, 1).contiguous()
+    # The receive buffer has the same shape as the send buffer.
+    output = torch.empty_like(input_t)
+    dist.all_to_all_single(output, input_t, group=group)
+    # if scattering the seq-dim, transpose the heads back to the original dimension
+    # [sp_size, seq_len//sp_size, batch_size, head_num // sp_size, head_dim] -->
+    # [seq_len//sp_size,batch_size, sp_size, head_num // sp_size, head_dim]
+    if scatter_idx < 2:
+        output = output.transpose(0, 1).transpose(1, 2).contiguous()
+    # Fold the per-rank chunks into the gather dimension.
+    return output.reshape(
+        inp_shape[: gather_idx] + \
+        [inp_shape[gather_idx] * seq_world_size,] + \
+        inp_shape[gather_idx + 1:]).contiguous()
+class _SeqAllToAll(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx: Any, group: 'dist.ProcessGroup', input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor:
+        ctx.group = group
+        ctx.scatter_idx = scatter_idx
+        ctx.gather_idx = gather_idx
+        return single_all_to_all(input, scatter_idx, gather_idx, group)
+    @staticmethod
+    def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
+        return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None)
+# Adapted from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py
+# with some bug fixes so that the tensor dimension layout matches the one used in training.
+class DistributedAttention(nn.Module):
+    """Initialization.
+    Arguments:
+        local_attention (Module): local attention with q,k,v
+        sequence_process_group (ProcessGroup): sequence parallel process group
+        scatter_idx (int): scatter_idx for all2all comm
+        gather_idx (int): gather_idx for all2all comm
+    """
+    def __init__(
+        self,
+        local_attention: nn.Module,
+        sequence_process_group: 'dist.ProcessGroup',
+        scatter_idx: int = 2,
+        gather_idx: int = 0,
+    ) -> None:
+        super(DistributedAttention, self).__init__()
+        self.local_attn = local_attention
+        self.spg = sequence_process_group
+        self.scatter_idx = scatter_idx
+        self.gather_idx = gather_idx
+    def pad_attention_head(self, query: Tensor, key: Tensor, value: Tensor):
+        # 将输入的head 维度pad到sp_size的倍数
+        sp_size = torch.distributed.get_world_size(self.spg)
+        pad_size = (sp_size - query.size(1) % sp_size) % sp_size
+        if pad_size > 0:
+            # [bs, num_head, seq_len, head_dim] -> [bs, num_head+pad_size, seq_len, head_dim]
+            query = torch.nn.functional.pad(query, (0,0,0,0,0,pad_size), value = 0.01)
+            key = torch.nn.functional.pad(key, (0,0,0,0,0,pad_size), value = 0.01)
+            value = torch.nn.functional.pad(value, (0,0,0,0,0,pad_size),value=0.0)
+        return query, key, value
+    def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any, **kwargs) -> Tensor:
+        """ forward
+        Arguments:
+            query (Tensor): query input to the layer [batch_size, num_head, seq_len, head_dim]
+            key (Tensor): key input to the layer
+            value (Tensor): value input to the layer
+            args: other args
+        Returns:
+            * output (Tensor): context output
+        """
+        # TODO Merge three alltoall calls into one
+        # TODO (Reza): change the api on the megatron-deepspeed side so that we only receive all data (q,k, and v) together!
+        # [batch_size,num_head,seq_len, head_dim ]trans to [seq_len,batch_size,num_head,head_dim]
+        origin_num_head = query.size(1)
+        query, key, value = self.pad_attention_head(query,key,value)
+        query = query.transpose(1,2).transpose(0,1)
+        key = key.transpose(1,2).transpose(0,1)
+        value = value.transpose(1,2).transpose(0,1)
+        #in shape : e.g.,  [s/p,bs,h,head_dim]
+        query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx).transpose(0,1).transpose(1,2).contiguous()
+        key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx).transpose(0,1).transpose(1,2).contiguous()
+        value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx).transpose(0,1).transpose(1,2).contiguous()
+        context_layer = self.local_attn(query_layer, key_layer, value_layer, *args, **kwargs)
+        context_layer = context_layer.transpose(0,1).contiguous()
+        # [seq_len, batch_size, num_head, head_dim]
+        output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx)
+        return output.transpose(0,1)[:,:,:origin_num_head,:]
+class LocalAttention(nn.Module):
+    def __init__(self, hidden_size, num_heads, head_dim):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+    def forward(self, q, k, v, *args, use_flash=True, **kwargs):
+        # input q,k,v [batch_size, num_head, seq_len, head_dim]
+        # output [batch_size, seq_len, num_head, head_dim]
+        if use_flash:
+            q_len, num_heads = q.shape[2], q.shape[1]
+            q = q.transpose(1,2).reshape(-1, num_heads, self.head_dim)
+            k = k.transpose(1,2).reshape(-1, num_heads, self.head_dim)
+            v = v.transpose(1,2).reshape(-1, num_heads, self.head_dim)
+            return flash_attn_varlen_func(q,k,v,*args, **kwargs).reshape(-1,q_len, num_heads, self.head_dim)
+        else:
+            with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False):
+                attn_output = F.scaled_dot_product_attention(
+                    q,k,v, *args, **kwargs)
+            attn_output = attn_output.transpose(1, 2)
+            return attn_output
+def create_attention_layer(hidden_size, num_heads, head_dim):
+    if get_sequence_parallel_group() is None:
+        return LocalAttention(hidden_size, num_heads, head_dim)
+    else:
+        return DistributedAttention(
+            local_attention=LocalAttention(hidden_size, num_heads, head_dim),
+            sequence_process_group=get_sequence_parallel_group()
+        )
+def get_sequence_parallel_chunk(tensor, dim=1, shift=0):
+    assert tensor.size(dim) % get_sequence_parallel_size() == 0
+    original_size = tensor.size(dim)
+    if shift:
+        tensor = tensor.split([shift, tensor.size(dim) - shift], dim=dim)[1]
+    if get_sequence_parallel_group() is None:
+        return tensor
+    else:
+        chunk_size = original_size // get_sequence_parallel_size()
+        return tensor.split(chunk_size, dim=dim)[get_sequence_parallel_rank()]

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<B_SYS>",
+    "<B_USYS>",
+    "<C_Q>",
+    "<C_A>",
+    "<B_FUNC>",
+    "<B_CODE>",
+    "<B_APE>",
+    "<function_calling>",
+    "<calc_start>",
+    "<calc_end>",
+    "<inner_think>",
+    "<audio_start_baichuan>",
+    "<audio_end_baichuan>",
+    "<audio_pad_baichuan>",
+    "<img_start_baichuan>",
+    "<img_end_baichuan>",
+    "<img_pad_baichuan>",
+    "<img_newline_baichuan>",
+    "<box_start_baichuan>",
+    "<box_end_baichuan>",
+    "<box_delim_baichuan>",
+    "<ref_start_baichuan>",
+    "<ref_end_baichuan>",
+    "<img_delim_baichuan>",
+    "<polygon_start_baichuan>",
+    "<polygon_end_baichuan>",
+    "<baichuan_pad_token>",
+    "<reserved_113>",
+    "<audio_delim_baichuan>",
+    "<video_start_baichuan>",
+    "<video_end_baichuan>",
+    "<video_palce_baichuan>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff