diff --git a/mllm/flamingo/__init__.py b/mllm/flamingo/__init__.py deleted file mode 100644 index 41b48ea349145d41194e67bc36ae054e6ef528c7..0000000000000000000000000000000000000000 --- a/mllm/flamingo/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import TYPE_CHECKING - -from transformers.utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) - - -_import_structure = { - "configuration_flamingo": [ - "FlamingoConfig", - ], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flamingo"] = [ - "FlamingoModel", - "FlamingoPreTrainedModel", - "FlamingoForConditionalGeneration", - ] - -if TYPE_CHECKING: - from .configuration_flamingo import FlamingoConfig - - # from .processing_flamingo import FlamingoProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flamingo import ( - FlamingoForConditionalGeneration, - FlamingoModel, - FlamingoPreTrainedModel, - ) - -else: - import sys - - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/mllm/flamingo/config.json b/mllm/flamingo/config.json deleted file mode 100644 index fb502ff2a98b0f3a7649b7fdbb4461feb2c18eb6..0000000000000000000000000000000000000000 --- a/mllm/flamingo/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "model_type": "flamingo", - "cross_attn_every_n_layers": 4, - "tie_word_embeddings": false, - "use_media_placement_augmentation": true, - "only_attend_previous": true, - "text_config": { - "_name_or_path": "luodian/llama-7b-hf", - "model_type": "llama" - }, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "model_type": "clip_vision_model", - "hidden_size": 1024, - "intermediate_size": 4096, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "image_size": 224, - "patch_size": 14 - } -} \ No newline at end of file diff --git a/mllm/flamingo/configuration_flamingo.py b/mllm/flamingo/configuration_flamingo.py deleted file mode 100644 index 08c6255a8dbe2f19e56e3c376f59104c4067415c..0000000000000000000000000000000000000000 --- a/mllm/flamingo/configuration_flamingo.py +++ /dev/null @@ -1,100 +0,0 @@ -import copy - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -from transformers.models.auto import CONFIG_MAPPING -from transformers.models.clip import CLIPVisionConfig -import sys - -from .falcon.configuration_RW import RWConfig -from .mpt.configuration_mpt import MPTConfig -from .mpt_redpajama.configuration_mosaic_gpt import MosaicGPTConfig - - -logger = logging.get_logger(__name__) - - -class FlamingoConfig(PretrainedConfig): - r""" - [`FlamingoConfig`] is the configuration class to store the configuration of a [`FlamingoForConditionalGeneration`]. It is - used to instantiate a Flamingo model according to the specified arguments, defining the vision model and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to - that of the Flamingo architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`PretrainedConfig`]. 
-        text_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize any text [`PretrainedConfig`].
-        cross_attn_every_n_layers (`int`, *optional*, defaults to 4):
-            Insert a gated cross-attention layer after every `cross_attn_every_n_layers` language-model layers.
-        use_media_placement_augmentation (`bool`, *optional*, defaults to `True`):
-            Whether to use media placement augmentation when interleaving images and text.
-        kwargs (*optional*):
-            Dictionary of keyword arguments.
-
-    Example:
-
-    ```python
-    >>> from mllm.flamingo import FlamingoConfig, FlamingoForConditionalGeneration
-
-    >>> # Initializing a FlamingoConfig with default values
-    >>> configuration = FlamingoConfig()
-
-    >>> # Initializing a FlamingoForConditionalGeneration (with random weights) from that configuration
-    >>> model = FlamingoForConditionalGeneration(configuration)
-    ```"""
-
-    model_type = "flamingo"
-    is_composition = True
-
-    def __init__(
-        self,
-        vision_config=None,
-        text_config=None,
-        cross_attn_every_n_layers: int = 4,
-        use_media_placement_augmentation: bool = True,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        if vision_config is None:
-            vision_config = {}
-            logger.info("vision_config is None. Initializing the vision config with default values.")
-
-        if text_config is None:
-            text_config = {}
-            logger.info("text_config is None. Initializing the text config with default values.")
-
-        self.vision_config = CLIPVisionConfig(**vision_config)
-        if text_config.get("architectures") is not None:
-            architecture = text_config["architectures"][0]
-            if architecture == "MPTForCausalLM":
-                self.text_config = MPTConfig(**text_config)
-            elif architecture == "MosaicGPT":
-                self.text_config = MosaicGPTConfig(**text_config)
-            elif architecture == "RWForCausalLM":
-                self.text_config = RWConfig(**text_config)
-            elif architecture == "LlamaForCausalLM":
-                self.text_config = CONFIG_MAPPING[text_config.pop("model_type")](**text_config)
-            else:
-                raise ValueError(f"Unsupported text architecture: {architecture}")
-        else:
-            self.text_config = CONFIG_MAPPING[text_config.pop("model_type")](**text_config)
-
-        self.cross_attn_every_n_layers = cross_attn_every_n_layers
-        self.use_media_placement_augmentation = use_media_placement_augmentation
-
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["vision_config"] = self.vision_config.to_dict()
-        output["text_config"] = self.text_config.to_dict()
-        output["model_type"] = self.__class__.model_type
-        output["cross_attn_every_n_layers"] = self.cross_attn_every_n_layers
-        output["use_media_placement_augmentation"] = self.use_media_placement_augmentation
-        return output
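For reference, the deleted `FlamingoConfig` composes a CLIP vision config with one of several text configs. A minimal sketch of constructing one, assuming the repository root is on `PYTHONPATH` so that `mllm.flamingo` (the package deleted above) resolves; the values mirror the `config.json` shown earlier rather than any verified checkpoint:

```python
# Sketch only: mirrors mllm/flamingo/config.json from this diff.
from mllm.flamingo import FlamingoConfig

config = FlamingoConfig(
    vision_config={
        "_name_or_path": "openai/clip-vit-large-patch14",
        "model_type": "clip_vision_model",
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "num_attention_heads": 16,
        "num_hidden_layers": 24,
        "image_size": 224,
        "patch_size": 14,
    },
    text_config={"_name_or_path": "luodian/llama-7b-hf", "model_type": "llama"},
    cross_attn_every_n_layers=4,
    use_media_placement_augmentation=True,
)

# to_dict() re-serializes the nested vision/text configs for saving.
assert config.to_dict()["cross_attn_every_n_layers"] == 4
```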
diff --git a/mllm/flamingo/converting_flamingo_to_bf16.py b/mllm/flamingo/converting_flamingo_to_bf16.py
deleted file mode 100644
index 8f9a38902242abfa6b76c3cbe45d69eede62bce0..0000000000000000000000000000000000000000
--- a/mllm/flamingo/converting_flamingo_to_bf16.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import argparse
-
-import torch
-
-from .modeling_flamingo import FlamingoForConditionalGeneration
-
-parser = argparse.ArgumentParser(description="Load a Flamingo checkpoint and save it at reduced precision.")
-parser.add_argument("--load_bit", type=str, choices=["fp16", "bf16"], required=True, help="Choose either 'fp16' or 'bf16'")
-parser.add_argument("--pretrained_model_path", type=str, required=True, help="Path to the full-precision checkpoint folder")
-parser.add_argument("--saved_model_path", type=str, required=True, help="Where to write the converted checkpoint")
-args = parser.parse_args()
-
-load_bit = args.load_bit
-pretrained_model_path = args.pretrained_model_path
-
-if load_bit == "fp16":
-    precision = {"torch_dtype": torch.float16}
-elif load_bit == "bf16":
-    precision = {"torch_dtype": torch.bfloat16}
-
-# load on CPU so the cast does not require GPU memory
-device_id = "cpu"
-model = FlamingoForConditionalGeneration.from_pretrained(pretrained_model_path, device_map={"": device_id}, **precision)
-
-# save the converted weights, suffixing the output path with the precision
-checkpoint_path = args.saved_model_path + f"-{load_bit}"
-model.save_pretrained(checkpoint_path, max_shard_size="10GB")
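In Python, the conversion above reduces to a `from_pretrained`/`save_pretrained` round trip; a sketch with placeholder paths, assuming the same package layout:

```python
# Equivalent of running the script above with --load_bit bf16.
# Paths are hypothetical; assumes mllm.flamingo is importable.
import torch
from mllm.flamingo import FlamingoForConditionalGeneration

model = FlamingoForConditionalGeneration.from_pretrained(
    "checkpoints/flamingo-mpt-7B-instruct-init",  # hypothetical input path
    device_map={"": "cpu"},       # cast on CPU, no GPU memory needed
    torch_dtype=torch.bfloat16,   # torch.float16 for the fp16 variant
)
model.save_pretrained("checkpoints/flamingo-mpt-7B-instruct-init-bf16", max_shard_size="10GB")
```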
-""" - -import re -import argparse -import os - -import torch -import torch.nn as nn -from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer - -import sys -from modeling_flamingo import FlamingoForConditionalGeneration - -from configuration_flamingo import FlamingoConfig - - -@torch.no_grad() -def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: - old_ckpt = torch.load(old_ckpt_path, map_location="cpu") - if old_ckpt.get("model_state_dict", None) is not None: - old_ckpt = old_ckpt["model_state_dict"] - new_ckpt = old_ckpt - folder_path = os.path.dirname(old_ckpt_path) - # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "flamingo/config.json" - model = FlamingoForConditionalGeneration.from_pretrained( - args.pretrained_model_path, - device_map="auto", - ) - _ = model.load_state_dict(new_ckpt, strict=False) - print(f"Saving HF model to {new_folder_path}") - model.save_pretrained(new_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--old_ckpt_path", - "-old", - type=str, - required=True, - help="Path to the pt checkpoint", - ) - parser.add_argument( - "--new_hf_path", - "-new", - type=str, - required=True, - help="Path to the hf folder", - ) - parser.add_argument( - "--pretrained_model_path", - "-pretrained", - type=str, - required=True, - help="Path to the pretrained model folder", - ) - args = parser.parse_args() - if not os.path.exists(os.path.dirname(args.new_hf_path)): - os.makedirs(os.path.dirname(args.new_hf_path)) - dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) diff --git a/mllm/flamingo/converting_flamingo_to_lora.py b/mllm/flamingo/converting_flamingo_to_lora.py deleted file mode 100644 index a5357d33dea1659bd397ea4d1bdc19af362b8153..0000000000000000000000000000000000000000 --- a/mllm/flamingo/converting_flamingo_to_lora.py +++ /dev/null @@ -1,68 +0,0 @@ -import argparse -import torch -import sys - -from .modeling_flamingo import FlamingoForConditionalGeneration -from peft import get_peft_model, LoraConfig, TaskType - -MODEL_CLASSES = { - "LlamaForCausalLM": "llama", - "OPTForCausalLM": "opt", - "GPTJForCausalLM": "gptj", - "GPTNeoXForCausalLM": "gpt_neox", - "MPTForCausalLM": "mpt", -} - -# Define argument parser -parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") - -# Add arguments -parser.add_argument( - "--checkpoint_path", - type=str, - help="Path to the pre-trained model checkpoint.", - default="", -) -parser.add_argument( - "--save_path", - type=str, - default="", - help="Path to the converted model checkpoint.", -) - -# Parse the input arguments -args = parser.parse_args() - -load_bit = "bf16" -if load_bit == "fp16": - precision = {"torch_dtype": torch.float16} -elif load_bit == "bf16": - precision = {"torch_dtype": torch.bfloat16} - -# Load the model -model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) - -# adding lora -standard_modules = ["q_proj", "v_proj"] -lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] -model_to_lora_modules = { - "llama": standard_modules, - "opt": standard_modules, - "gptj": standard_modules, - "gpt_neox": ["query_key_value"], - "mpt": ["Wqkv"], -} -lora_config = LoraConfig( - r=16, - lora_alpha=32, - lora_dropout=0.05, - task_type=TaskType.CAUSAL_LM, - 
target_modules=model_to_lora_modules[lang_encoder_short_name], -) -model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}}) -model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) -model.lang_encoder.print_trainable_parameters() - -# Save the model -checkpoint_path = args.save_path -FlamingoForConditionalGeneration.save_pretrained(model, checkpoint_path) diff --git a/mllm/flamingo/falcon/__init__.py b/mllm/flamingo/falcon/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mllm/flamingo/falcon/__pycache__/__init__.cpython-39.pyc b/mllm/flamingo/falcon/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 90f87b0fe8b252c75261cd040f782f79b786c24b..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/falcon/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/falcon/__pycache__/configuration_RW.cpython-39.pyc b/mllm/flamingo/falcon/__pycache__/configuration_RW.cpython-39.pyc deleted file mode 100644 index e1907bb88151a8dcacd7a42b19c9bfa42272d746..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/falcon/__pycache__/configuration_RW.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/falcon/__pycache__/modelling_RW.cpython-39.pyc b/mllm/flamingo/falcon/__pycache__/modelling_RW.cpython-39.pyc deleted file mode 100644 index c268293bb358403326f1e9356390ed486a652180..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/falcon/__pycache__/modelling_RW.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/falcon/configuration_RW.py b/mllm/flamingo/falcon/configuration_RW.py deleted file mode 100644 index ab845c0d34462b3b3391d3b8119c11816e9c02ce..0000000000000000000000000000000000000000 --- a/mllm/flamingo/falcon/configuration_RW.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
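The `RWConfig` defined below is the text-side config consumed by the `flamingo-falcon-7B.json` later in this diff; as a reading aid, a hypothetical instantiation with that file's falcon-7B geometry:

```python
# Illustrative values copied from the flamingo-falcon-7B.json text_config
# further down in this diff; the import path assumes this repo's layout.
from mllm.flamingo.falcon.configuration_RW import RWConfig

falcon_7b = RWConfig(
    vocab_size=65024,
    hidden_size=4544,
    n_layer=32,
    n_head=71,
    bias=False,
    multi_query=True,    # one shared key/value head
    parallel_attn=True,  # attention and MLP read the same layernorm output
    bos_token_id=11,
    eos_token_id=11,
)

# derived properties defined on the class below
assert falcon_7b.head_dim == 64  # 4544 / 71
assert falcon_7b.rotary          # rotary embeddings, since alibi=False
```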
-""" Bloom configuration""" -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class RWConfig(PretrainedConfig): - model_type = "RefinedWebModel" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - } - - def __init__( - self, - vocab_size=250880, - hidden_size=64, - n_layer=2, - n_head=8, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - apply_residual_connection_post_layernorm=False, - hidden_dropout=0.0, - attention_dropout=0.0, - multi_query=False, - alibi=False, - bias=False, - parallel_attn=False, - **kwargs, - ): - self.vocab_size = vocab_size - # Backward compatibility with n_embed kwarg - n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.n_layer = n_layer - self.n_head = n_head - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.multi_query = multi_query - self.alibi = alibi - self.bias = bias - self.parallel_attn = parallel_attn - - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - @property - def head_dim(self): - return self.hidden_size // self.n_head - - @property - def rotary(self): - return not self.alibi diff --git a/mllm/flamingo/falcon/modelling_RW.py b/mllm/flamingo/falcon/modelling_RW.py deleted file mode 100644 index b929a5f5bac5789e454f46bc1df4850821fed45d..0000000000000000000000000000000000000000 --- a/mllm/flamingo/falcon/modelling_RW.py +++ /dev/null @@ -1,1064 +0,0 @@ -# port of models described in RW -# We use the bloom model as a starting point for these model. -# Please refer to the bloom models for usage instructions. - -import math -import warnings -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss -from torch.nn import functional as F - -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - QuestionAnsweringModelOutput, - SequenceClassifierOutputWithPast, - TokenClassifierOutput, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from .configuration_RW import RWConfig - -logger = logging.get_logger(__name__) - - -# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations. -# In order not to degrade the quality of our HF-port, we keep these characteristics in the final model. -class Linear(nn.Linear): - def forward(self, input: torch.Tensor) -> torch.Tensor: - ret = input @ self.weight.T - if self.bias is None: - return ret - else: - return ret + self.bias - - -from einops import rearrange - - -# rotary pos emb helpers (torch.jit.script does not seem to support staticmethod...) 
-def rotate_half(x): - x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in torch < 1.8.0 - - -class RotaryEmbedding(torch.nn.Module): - """Implementation of RotaryEmbedding from GPT-NeoX. - This implementation is design to operate on queries and keys that are compatible with - [batch_size, n_heads_per_partition, seq_len, head_dim] (e.g. MinGPTAttention format). - """ - - def __init__( - self, - head_dim: int, - base=10000, - ): - super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.head_dim = head_dim - self.seq_len_cached = None - self.batch_size_cached = None - self.cos_cached: torch.Tensor | None = None - self.sin_cached: torch.Tensor | None = None - - def cos_sin( - self, - seq_len: int, - device="cuda", - dtype=torch.bfloat16, - ) -> torch.Tensor: - if seq_len != self.seq_len_cached: - self.seq_len_cached = seq_len - t = torch.arange(seq_len, device=device).type_as(self.inv_freq) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1).to(device) - - if dtype in [torch.float16, torch.bfloat16]: - emb = emb.float() - - self.cos_cached = emb.cos()[None, :, :] - self.sin_cached = emb.sin()[None, :, :] - - self.cos_cached = self.cos_cached.type(dtype) - self.sin_cached = self.sin_cached.type(dtype) - - return self.cos_cached, self.sin_cached - - def forward(self, q, k): - batch, seq_len, head_dim = q.shape - cos, sin = self.cos_sin(seq_len, q.device, q.dtype) - return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) - - -def _make_causal_mask(input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int) -> torch.BoolTensor: - batch_size, target_length = input_ids_shape - mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device) - # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround - seq_ids = torch.arange(target_length, device=device) - mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :] - - if past_key_values_length > 0: - mask[:, :past_key_values_length] = False - - expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length) - return expanded_mask - - -def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: - batch_size, src_length = mask.shape - tgt_length = tgt_length if tgt_length is not None else src_length - - expanded_mask = ~(mask[:, None, None, :].to(torch.bool)) - return expanded_mask.expand(batch_size, 1, tgt_length, src_length) - - -def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: - batch_size, seq_length = attention_mask.shape - closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) - base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32) - powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != num_heads: - extra_base = torch.tensor(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32) - num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) - extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, 
device=attention_mask.device, dtype=torch.int32) - slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) - - # Note: alibi will added to the attention bias that will be applied to the query, key product of attention - # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) - # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) - # => the query_length dimension will then be broadcasted correctly - # This is more or less identical to T5's relative position bias: - # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 - arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] - alibi = slopes[..., None].bfloat16() * arange_tensor - return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) - - -def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: - out = F.dropout(x, p=prob, training=training) - out = residual + out - return out - - -class Attention(nn.Module): - def __init__(self, config: RWConfig): - super().__init__() - - self.hidden_size = config.hidden_size - self.num_heads = config.n_head - self.head_dim = self.hidden_size // self.num_heads - self.split_size = self.hidden_size - self.hidden_dropout = config.hidden_dropout - - if self.head_dim * self.num_heads != self.hidden_size: - raise ValueError(f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" f" {self.num_heads}).") - - self.maybe_rotary = RotaryEmbedding(config.head_dim) if config.rotary else lambda q, k: (q, k) - - # Layer-wise attention scaling - self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) - self.beta = self.inv_norm_factor - - self.query_key_value = Linear( - self.hidden_size, - 3 * self.hidden_size if not config.multi_query else (self.hidden_size + 2 * self.head_dim), - bias=config.bias, - ) - self.multi_query = config.multi_query - self.dense = Linear(self.hidden_size, self.hidden_size, bias=config.bias) - self.attention_dropout = nn.Dropout(config.attention_dropout) - self.num_kv = config.n_head if not self.multi_query else 1 - - def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory - storage as `fused_qkv` - - Args: - fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] - - Returns: - query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] - value: [batch_size, seq_length, num_heads, head_dim] - """ - if not self.multi_query: - batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) - return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] - else: - batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) - return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] - - def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: - """ - Merge heads together over the last dimenstion - - Args: - x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] - - Returns: - torch.tensor: [batch_size, seq_length, 
num_heads * head_dim] - """ - # What we want to achieve is: - # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim - batch_size_and_num_heads, seq_length, _ = x.shape - batch_size = batch_size_and_num_heads // self.num_heads - - # First view to decompose the batch size - # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim - x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) - - # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim - x = x.permute(0, 2, 1, 3) - - # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim - return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim) - - def forward( - self, - hidden_states: torch.Tensor, - alibi: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - - batch_size, q_length, _, _ = query_layer.shape - - query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) - key_layer = key_layer.transpose(1, 2).reshape( - batch_size * self.num_kv, - q_length, - self.head_dim, - ) - value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_kv, q_length, self.head_dim) - - query_layer, key_layer = self.maybe_rotary(query_layer, key_layer) - - if layer_past is not None: - past_key, past_value = layer_past - # concatenate along seq_length dimension: - # - key: [batch_size * self.num_heads, head_dim, kv_length] - # - value: [batch_size * self.num_heads, kv_length, head_dim] - key_layer = torch.cat((past_key, key_layer), dim=1) - value_layer = torch.cat((past_value, value_layer), dim=1) - - _, kv_length, _ = key_layer.shape - - if use_cache is True: - present = (key_layer, value_layer) - else: - present = None - - if alibi is None: - query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim) - key_layer_ = key_layer.reshape(batch_size, self.num_kv, -1, self.head_dim) - value_layer_ = value_layer.reshape(batch_size, self.num_kv, -1, self.head_dim) - - attn_output = F.scaled_dot_product_attention(query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True) - - x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim) - x = x.permute(0, 2, 1, 3) - attn_output = x.reshape(batch_size, q_length, self.num_heads * self.head_dim) - - output_tensor = self.dense(attn_output) - - outputs = (output_tensor, present) - assert not output_attentions # not supported. 
- return outputs - else: - attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, -1e9).to(torch.bfloat16) - matmul_result = query_layer @ key_layer.transpose(-1, -2) - - # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) - - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` - if input_dtype == torch.float16 or input_dtype == torch.bfloat16: - attention_scores = attention_scores.to(torch.float32) - # attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) - attention_probs = F.softmax( - (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)) * self.inv_norm_factor + attention_mask_float, - dim=-1, - dtype=hidden_states.dtype, - ) - # [batch_size, num_heads, q_length, kv_length] - attention_probs = self.attention_dropout(attention_probs) - - if head_mask is not None: - attention_probs = attention_probs * head_mask - - # change view [batch_size x num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, kv_length) - - # matmul: [batch_size * num_heads, q_length, head_dim] - context_layer = attention_probs_reshaped @ value_layer - - # change view [batch_size, num_heads, q_length, head_dim] - context_layer = self._merge_heads(context_layer) - - output_tensor = self.dense(context_layer) - - outputs = (output_tensor, present) - if output_attentions: - outputs += (attention_probs,) - - return outputs - - -class MLP(nn.Module): - def __init__(self, config: RWConfig): - super().__init__() - hidden_size = config.hidden_size - - self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size, bias=config.bias) - self.act = nn.GELU() - self.dense_4h_to_h = Linear(4 * hidden_size, hidden_size, bias=config.bias) - self.hidden_dropout = config.hidden_dropout - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.act(self.dense_h_to_4h(x)) - x = self.dense_4h_to_h(x) - return x - - -class DecoderLayer(nn.Module): - def __init__(self, config: RWConfig): - super().__init__() - hidden_size = config.hidden_size - - self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.num_heads = config.n_head - self.self_attention = Attention(config) - - if not config.parallel_attn: - # unused if parallel attn - self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - - self.mlp = MLP(config) - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - self.hidden_dropout = config.hidden_dropout - - self.config = config - - def forward( - self, - hidden_states: torch.Tensor, - alibi: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): - layernorm_output = self.input_layernorm(hidden_states) - residual = hidden_states - - # Self attention. 
- attn_outputs = self.self_attention( - layernorm_output, - layer_past=layer_past, - attention_mask=attention_mask, - alibi=alibi, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - attention_output = attn_outputs[0] - - if not self.config.parallel_attn: - residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training) - layernorm_output = self.post_attention_layernorm(residual) - - outputs = attn_outputs[1:] - - # MLP. - mlp_output = self.mlp(layernorm_output) - - if self.config.parallel_attn: - mlp_output += attention_output - - output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) - - if use_cache: - outputs = (output,) + outputs - else: - outputs = (output,) + outputs[1:] - - return outputs # hidden_states, present, attentions - - -class RWPreTrainedModel(PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = RWConfig - base_model_prefix = "transformer" - supports_gradient_checkpointing = True - _no_split_modules = ["DecoderLayer"] - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - if isinstance(module, nn.Linear) or isinstance(module, Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False): - if isinstance(module, RWModel): - module.gradient_checkpointing = value - - @staticmethod - def _convert_to_standard_cache(past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Standardizes the format of the cache so as to match most implementations, i.e. 
to tuple(tuple([batch_size, - num_heads, ...])) - """ - batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape - num_heads = batch_size_times_num_heads // batch_size - # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] - # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] - return tuple( - ( - layer_past[0].view(batch_size, num_heads, head_dim, seq_length), - layer_past[1].view(batch_size, num_heads, seq_length, head_dim), - ) - for layer_past in past_key_value - ) - - @staticmethod - def _convert_to_rw_cache(past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape - batch_size_times_num_heads = batch_size * num_heads - # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] - # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] - return tuple( - ( - layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length), - layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim), - ) - for layer_past in past_key_value - ) - - -class RWModel(RWPreTrainedModel): - def __init__(self, config: RWConfig): - super().__init__(config) - - self.embed_dim = config.hidden_size - self.num_heads = config.n_head - self.alibi = config.alibi - - # Embedding + LN Embedding - self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) - - # Transformer blocks - self.h = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) - - # Final Layer Norm - self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.word_embeddings - - def _prepare_attn_mask(self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor: - # create causal mask - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - combined_attention_mask = None - device = attention_mask.device - _, src_length = input_shape - - if src_length > 1: - combined_attention_mask = _make_causal_mask(input_shape, device=device, past_key_values_length=past_key_values_length) - - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length) - combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask - - return combined_attention_mask - - def set_input_embeddings(self, new_embeddings: torch.Tensor): - self.word_embeddings = new_embeddings - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - if deprecated_arguments.pop("position_ids", False) is not False: - # 
`position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape batch_size x num_heads x N x N - # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - - hidden_states = inputs_embeds - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - # Compute alibi tensor: check build_alibi_tensor documentation - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) - else: - attention_mask = attention_mask.to(hidden_states.device) - - if self.alibi: - alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) - else: - alibi = None - - causal_mask = self._prepare_attn_mask( - attention_mask, - input_shape=(batch_size, seq_length), - past_key_values_length=past_key_values_length, - ) - - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`...") - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache=use_cache, output_attentions=output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - alibi, - causal_mask, - head_mask[i], - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=causal_mask, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - alibi=alibi, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Add last hidden state - hidden_states = self.ln_f(hidden_states) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class RWForCausalLM(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - - def __init__(self, config: RWConfig): - super().__init__(config) - self.transformer = RWModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings: torch.Tensor): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - **kwargs, - ) -> dict: - # only last token for input_ids if past is not None - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) - - # the cache may be in the stardard format (e.g. in contrastive search), convert to our's format if needed - if past[0][0].shape[0] == input_ids.shape[0]: - past = self._convert_to_rw_cache(past) - - return { - "input_ids": input_ids, - "past_key_values": past, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - batch_size, seq_length, vocab_size = shift_logits.shape - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - def _reorder_cache(self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. - """ - standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx)) - - # Get a copy of `beam_idx` on all the devices where we need those indices. 
- device_to_beam_idx = {past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past} - reordered_past = tuple( - ( - layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), - layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), - ) - for layer_past in standardized_past - ) - return self._convert_to_rw_cache(reordered_past) - - -class RWForSequenceClassification(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - - def __init__(self, config: RWConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.transformer = RWModel(config) - self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(dim=-1) - 1 - else: - sequence_lengths = -1 - logger.warning( - f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " - "unexpected if using padding tokens in conjunction with `inputs_embeds.`" - ) - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -class RWForTokenClassification(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - - def __init__(self, config: RWConfig): - super().__init__(config) - self.num_labels = config.num_labels - - self.transformer = RWModel(config) - if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: - classifier_dropout = config.classifier_dropout - elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - hidden_states = self.dropout(hidden_states) - logits = self.classifier(hidden_states) - - loss = None - if labels is not None: - batch_size, seq_length = labels.shape - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)) - - if not return_dict: - output = (logits,) + transformer_outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -class RWForQuestionAnswering(RWPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.transformer = RWModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/mllm/flamingo/flamingo-falcon-7B.json b/mllm/flamingo/flamingo-falcon-7B.json deleted file mode 100644 index f777f6b4f4c9a374830c195b84439b2541ba5cb3..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-falcon-7B.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoModel" - ], - "cross_attn_every_n_layers": 4, - "model_type": "flamingo", - "text_config": { - "architectures": [ - "RWForCausalLM" - ], - "apply_residual_connection_post_layernorm": false, - "attention_dropout": 0.0, - "bias": false, - "bos_token_id": 11, - "eos_token_id": 11, - "hidden_dropout": 0.0, - "hidden_size": 4544, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "RefinedWebModel", - "multi_query": true, - "n_head": 71, - "n_layer": 32, - "parallel_attn": true, - "torch_dtype": "bfloat16", - "transformers_version": "4.27.4", - "use_cache": true, - "vocab_size": 65024 - }, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - 
"initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.28.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} diff --git a/mllm/flamingo/flamingo-llama2-chat-13B.json b/mllm/flamingo/flamingo-llama2-chat-13B.json deleted file mode 100644 index 738211da747fe39163e677cfcff6fc646d8c720d..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-llama2-chat-13B.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 8, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "meta-llama/Llama-2-13b-chat-hf", - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 5120, - "initializer_range": 0.02, - "intermediate_size": 13824, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 40, - "num_hidden_layers": 40, - "num_key_value_heads": 40, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.30.1", - "use_cache": true, - "vocab_size": 32000 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 
16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-llama2-chat-7B.json b/mllm/flamingo/flamingo-llama2-chat-7B.json deleted file mode 100644 index 0676c97e29e31cf4731839ed1c8216f16349a2ad..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-llama2-chat-7B.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 4, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "meta-llama/Llama-2-7b-chat-hf", - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_length": 4096, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 32, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.32.0.dev0", - "use_cache": true, - "vocab_size": 32000 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - 
"pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-mpt-1B-redpajama.json b/mllm/flamingo/flamingo-mpt-1B-redpajama.json deleted file mode 100644 index f27dffdce8936e4785ddca8efb9446f5c635fede..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-mpt-1B-redpajama.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 1, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "", - "alibi": true, - "alibi_bias_max": 8, - "architectures": [ - "MosaicGPT" - ], - "attn_clip_qkv": null, - "attn_impl": "torch", - "attn_pdrop": 0, - "attn_qk_ln": true, - "attn_uses_sequence_id": false, - "d_model": 2048, - "hidden_size": 2048, - "emb_init_std": null, - "emb_init_uniform_lim": null, - "emb_pdrop": 0, - "embedding_fraction": 1.0, - "fan_mode": "fan_in", - "init_device": "cpu", - "init_div_is_residual": true, - "init_gain": 0, - "init_nonlinearity": "relu", - "init_std": 0.02, - "logit_scale": null, - "low_precision_layernorm": true, - "max_seq_len": 2048, - "mlp_ratio": 4, - "model_type": "mosaic_gpt", - "n_heads": 16, - "n_layers": 24, - "no_bias": true, - "param_init_fn": "kaiming_normal_", - "prefix_lm": false, - "resid_pdrop": 0, - "softmax_scale": null, - "tokenizer_name": "EleutherAI/gpt-neox-20b", - "torch_dtype": "float32", - "transformers_version": "4.27.4", - "use_cache": false, - "verbose": 0, - "vocab_size": 50432 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - 
"projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-mpt-30B-bf16.json b/mllm/flamingo/flamingo-mpt-30B-bf16.json deleted file mode 100644 index b91d30c3b4a429f06eb135444aa5379a57dfdebf..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-mpt-30B-bf16.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 7, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": [ - "MPTForCausalLM" - ], - "attn_config": { - "alibi": true, - "alibi_bias_max": 8, - "attn_impl": "torch", - "attn_pdrop": 0, - "attn_type": "multihead_attention", - "attn_uses_sequence_id": false, - "clip_qkv": null, - "prefix_lm": false, - "qk_ln": false, - "softmax_scale": null - }, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "d_model": 7168, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "emb_pdrop": 0, - "embedding_fraction": 1.0, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "expansion_ratio": 4, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_size": 7168, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "init_config": { - "emb_init_std": null, - "emb_init_uniform_lim": null, - "fan_mode": "fan_in", - "init_div_is_residual": true, - "init_gain": 0.0, - "init_nonlinearity": "relu", - "init_std": null, - "name": "kaiming_normal_", - "verbose": 0 - }, - "init_device": "cpu", - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "learned_pos_emb": true, - "length_penalty": 1.0, - "logit_scale": null, - "max_length": 20, - "max_seq_len": 8192, - "min_length": 0, - "model_type": "mpt", - "n_heads": 64, - "n_layers": 48, - "no_bias": true, - "no_repeat_ngram_size": 0, - "norm_type": "low_precision_layernorm", - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "resid_pdrop": 0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "tokenizer_name": "EleutherAI/gpt-neox-20b", - "top_k": 50, - "top_p": 1.0, - "torch_dtype": "bfloat16", - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false, - "use_cache": false, - 
"verbose": 0, - "vocab_size": 50432 - }, - "torch_dtype": "bfloat16", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-mpt-30B.json b/mllm/flamingo/flamingo-mpt-30B.json deleted file mode 100644 index 4678ba660d079b7cadbe0f3af0400b308d196bd8..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-mpt-30B.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 7, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": [ - "MPTForCausalLM" - ], - "attn_config": { - "alibi": true, - "alibi_bias_max": 8, - "attn_impl": "torch", - "attn_pdrop": 0, - "attn_type": "multihead_attention", - "attn_uses_sequence_id": false, - "clip_qkv": null, - "prefix_lm": false, - "qk_ln": false, - "softmax_scale": null - }, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "d_model": 7168, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "emb_pdrop": 0, - "embedding_fraction": 1.0, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "expansion_ratio": 4, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_size": 7168, - "id2label": { - "0": 
"LABEL_0", - "1": "LABEL_1" - }, - "init_config": { - "emb_init_std": null, - "emb_init_uniform_lim": null, - "fan_mode": "fan_in", - "init_div_is_residual": true, - "init_gain": 0.0, - "init_nonlinearity": "relu", - "init_std": null, - "name": "kaiming_normal_", - "verbose": 0 - }, - "init_device": "cpu", - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "learned_pos_emb": true, - "length_penalty": 1.0, - "logit_scale": null, - "max_length": 20, - "max_seq_len": 8192, - "min_length": 0, - "model_type": "mpt", - "n_heads": 64, - "n_layers": 48, - "no_bias": true, - "no_repeat_ngram_size": 0, - "norm_type": "low_precision_layernorm", - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "resid_pdrop": 0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "tokenizer_name": "EleutherAI/gpt-neox-20b", - "top_k": 50, - "top_p": 1.0, - "torch_dtype": "bfloat16", - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false, - "use_cache": false, - "verbose": 0, - "vocab_size": 50432 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": 
null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-mpt-7B.json b/mllm/flamingo/flamingo-mpt-7B.json deleted file mode 100644 index 9e1b681e921a9f657db9e1316b35d3ee270b6445..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-mpt-7B.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 4, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": [ - "MPTForCausalLM" - ], - "attn_config": { - "alibi": true, - "alibi_bias_max": 8, - "attn_impl": "torch", - "attn_pdrop": 0, - "attn_type": "multihead_attention", - "attn_uses_sequence_id": false, - "clip_qkv": null, - "prefix_lm": false, - "qk_ln": false, - "softmax_scale": null - }, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "d_model": 4096, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "emb_pdrop": 0, - "embedding_fraction": 1.0, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "expansion_ratio": 4, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_size": 4096, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "init_config": { - "emb_init_std": null, - "emb_init_uniform_lim": null, - "fan_mode": "fan_in", - "init_div_is_residual": true, - "init_gain": 0, - "init_nonlinearity": "relu", - "init_std": 0.02, - "name": "kaiming_normal_", - "verbose": 0 - }, - "init_device": "cpu", - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "learned_pos_emb": true, - "length_penalty": 1.0, - "logit_scale": null, - "max_length": 20, - "max_seq_len": 2048, - "min_length": 0, - "model_type": "mpt", - "n_heads": 32, - "n_layers": 32, - "no_bias": true, - "no_repeat_ngram_size": 0, - "norm_type": "low_precision_layernorm", - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "resid_pdrop": 0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "tokenizer_name": "EleutherAI/gpt-neox-20b", - "top_k": 50, - "top_p": 1.0, - "torch_dtype": "bfloat16", - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false, - "use_cache": false, - "verbose": 0, - "vocab_size": 50432 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - 
"decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-vicuna-33B-v1.3.json b/mllm/flamingo/flamingo-vicuna-33B-v1.3.json deleted file mode 100644 index 593706c96198c8b52c25113f9278fa0161186bd0..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-vicuna-33B-v1.3.json +++ /dev/null @@ -1,111 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 4, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "/home/luodian/projects/checkpoints/vicuna-33b-v1.3", - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 6656, - "initializer_range": 0.02, - "intermediate_size": 17920, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 52, - "num_hidden_layers": 60, - "pad_token_id": 0, - "rms_norm_eps": 1e-06, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.28.1", - "use_cache": false, - "vocab_size": 32000 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": 
"LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/flamingo-vicuna-7B-v1.3.json b/mllm/flamingo/flamingo-vicuna-7B-v1.3.json deleted file mode 100644 index 1e8ead8a8bcccba44b4796989924a2b291376510..0000000000000000000000000000000000000000 --- a/mllm/flamingo/flamingo-vicuna-7B-v1.3.json +++ /dev/null @@ -1,111 +0,0 @@ -{ - "_commit_hash": null, - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 4, - "model_type": "flamingo", - "text_config": { - "_name_or_path": "/mnt/petrelfs/share_data/zhangyuanhan/vicuna-7b-v1.3", - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "pad_token_id": 0, - "rms_norm_eps": 1e-06, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.28.1", - "use_cache": false, - "vocab_size": 32000 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - 
"num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/flamingo/injecting_falcon_into_flamingo.py b/mllm/flamingo/injecting_falcon_into_flamingo.py deleted file mode 100644 index 997b397941b5fa032ffa1730e7c508258dcbf8ee..0000000000000000000000000000000000000000 --- a/mllm/flamingo/injecting_falcon_into_flamingo.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import torch -from .configuration_flamingo import FlamingoConfig -from .modeling_flamingo import FlamingoForConditionalGeneration - -root_dir = os.environ["AZP"] -print(root_dir) - - -config = FlamingoConfig.from_json_file(".flamingo-falcon-7B.json") -model = FlamingoForConditionalGeneration(config=config) - - -state_dict_files = [ - f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00001-of-00002.bin", - f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00002-of-00002.bin", -] - -state_dict = {} -for file in state_dict_files: - state_dict_part = torch.load(file, map_location="cpu") - state_dict.update(state_dict_part) - - -state_dict_3 = torch.load("{root_dir}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu") -for cur_key in list(state_dict_3.keys()): - if "vision_encoder" not in cur_key: - del state_dict_3[cur_key] - -_ = model.load_state_dict( - state_dict_3, - False, -) -print(_[1]) - -save_state_dict_1 = {} -for key in state_dict: - if ".h." 
-        _, _, layer_num, *remain_names = key.split(".")
-        target_key = f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-    else:
-        target_key = key
-    save_state_dict_1[f"{target_key}"] = state_dict[key]
-_ = model.lang_encoder.load_state_dict(
-    save_state_dict_1,
-    False,
-)
-print(_[1])
-model.save_pretrained(f"{root_dir}/otter/checkpoints/flamingo-falcon-7b/")
diff --git a/mllm/flamingo/injecting_llama2_into_flamingo.py b/mllm/flamingo/injecting_llama2_into_flamingo.py
deleted file mode 100644
index 19d368a1cf33b9f38ffcfe0c1bfac5471a20e77b..0000000000000000000000000000000000000000
--- a/mllm/flamingo/injecting_llama2_into_flamingo.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import argparse
-import os
-
-import torch
-from tqdm import tqdm
-
-import sys
-
-from .configuration_flamingo import FlamingoConfig
-from .modeling_flamingo import FlamingoForConditionalGeneration
-
-# from .configuration_flamingo import FlamingoConfig
-# from .modeling_flamingo import FlamingoForConditionalGeneration
-
-parser = argparse.ArgumentParser(description="Convert Llama-2 model")
-parser.add_argument("--model_choice", type=str, default="13B", help="Choose either '7B' or '13B'")
-parser.add_argument("--llama2_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-args = parser.parse_args()
-
-# os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-root_dir = args.llama2_root_dir
-model_choice = args.model_choice
-save_root_dir = args.save_root_dir
-
-# prepare the Llama-2 chat model first
-# you can visit https://huggingface.co/meta-llama/Llama-2-7b-chat-hf to download the 7B and 13B chat checkpoints.
-if model_choice == "7B":
-    config_file = "./flamingo/flamingo-llama2-chat-7B.json"
-    state_dict_files = [
-        f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00001-of-00002.bin",
-        f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00002-of-00002.bin",
-    ]
-    save_path = f"{save_root_dir}/flamingo-llama2-chat-7B-init"
-elif model_choice == "13B":
-    config_file = "./flamingo/flamingo-llama2-chat-13B.json"
-    state_dict_files = [
-        f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00001-of-00003.bin",
-        f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00002-of-00003.bin",
-        f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00003-of-00003.bin",
-    ]
-    save_path = f"{save_root_dir}/flamingo-llama2-chat-13B-init"
-else:
-    raise ValueError("Invalid model_choice. Choose either '13B' or '7B'.")
-
-config = FlamingoConfig.from_json_file(config_file)
-model = FlamingoForConditionalGeneration(config=config)
-
-# load flamingo's vision encoder from the last checkpoint shard.
-# you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
-AZP = os.environ["AZP"]
-state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
-for cur_key in list(state_dict_3.keys()):
-    if "vision_encoder" not in cur_key:
-        del state_dict_3[cur_key]
-
-load_msg = model.load_state_dict(
-    state_dict_3,
-    False,
-)
-# print incompatible keys
-print(load_msg[1])
-
-# Loading Llama-2 weights
-state_dict = {}
-for file in tqdm(state_dict_files, desc="Loading state dict"):
-    state_dict_part = torch.load(file, map_location="cpu")
-    state_dict.update(state_dict_part)
-
-save_state_dict_1 = {}
-for key in state_dict:
-    if ".layers." in key:
-        _, _, layer_num, *remain_names = key.split(".")
-        target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-    else:
-        target_key = key
-    save_state_dict_1[f"{target_key}"] = state_dict[key]
-
-# Resize the token embeddings to 32000 to match the base Llama-2 vocabulary before loading
-model.lang_encoder.resize_token_embeddings(32000)
-
-load_msg = model.lang_encoder.load_state_dict(
-    save_state_dict_1,
-    False,
-)
-# Resize the token embeddings to 32002 to make room for the added special tokens
-model.lang_encoder.resize_token_embeddings(32002)
-# print incompatible keys
-print(load_msg[1])
-
-
-print(f"Saving model to {save_path}...")
-model.save_pretrained(save_path, max_shard_size="10GB")
diff --git a/mllm/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py b/mllm/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py
deleted file mode 100644
index 614e5055b9c52a88c30640cd3e79cb2244954be1..0000000000000000000000000000000000000000
--- a/mllm/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import argparse
-import os
-
-import torch
-from tqdm import tqdm
-
-import sys
-
-from configuration_flamingo import FlamingoConfig
-from modeling_flamingo import FlamingoForConditionalGeneration
-from utils import rename_flamingo_checkpoint
-
-
-parser = argparse.ArgumentParser(description="Convert MPT model")
-parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected")
-args = parser.parse_args()
-
-
-root_dir = args.mpt_root_dir
-save_root_dir = args.save_root_dir
-
-# prepare the mpt model first
-# you can visit https://huggingface.co/mosaicml to download the 1B RedPajama checkpoint.
-config_file = "./flamingo/flamingo-mpt-1B-redpajama.json"
-state_dict_file = f"{root_dir}/pytorch_model.bin"
-save_path = f"{save_root_dir}/flamingo-mpt-1b-redpajama-200b-dolly"
-
-config = FlamingoConfig.from_json_file(config_file)
-
-model = FlamingoForConditionalGeneration(config=config)
-
-# Loading mpt weights
-state_dict = torch.load(state_dict_file, map_location="cpu")
-save_state_dict_1 = {}
-for key in state_dict:
-    if ".blocks." in key:
-        _, _, layer_num, *remain_names = key.split(".")
-        target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-    else:
-        target_key = key
-    save_state_dict_1[f"{target_key}"] = state_dict[key]
-
-load_msg = model.lang_encoder.load_state_dict(
-    save_state_dict_1,
-    False,
-)
-
-# load flamingo's vision encoder from the last checkpoint shard.
-# you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
-AZP = os.environ["AZP"]
-state_dict_3 = torch.load(f"{AZP}/pytorch_model-00004-of-00004.bin", map_location="cpu")
-for cur_key in list(state_dict_3.keys()):
-    if "vision_encoder" not in cur_key:
-        del state_dict_3[cur_key]
-
-load_msg = model.load_state_dict(
-    state_dict_3,
-    False,
-)
-# print incompatible keys
-print(load_msg[1])
-
-save_state_dict_1 = {}
-for key in state_dict:
-    if ".blocks." in key:
-        _, _, layer_num, *remain_names = key.split(".")
-        target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-    else:
-        target_key = key
-    save_state_dict_1[f"{target_key}"] = state_dict[key]
-
-load_msg = model.lang_encoder.load_state_dict(
-    save_state_dict_1,
-    False,
-)
-# print incompatible keys
-print(load_msg[1])
-if args.flamingo_dir is not None:
-    state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu")
-    save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2)
-    real_vocab_size = config.text_config.vocab_size
-    # Resize the token embeddings to the Flamingo checkpoint's embedding size before loading
-    model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0])
-
-    load_msg = model.load_state_dict(
-        save_state_dict_2,
-        False,
-    )
-    # print incompatible keys
-    print(load_msg[1])
-    # Resize the token embeddings back to the full vocab size (50432)
-    model.lang_encoder.resize_token_embeddings(real_vocab_size)
-
-print(f"Saving model to {save_path}...")
-model.save_pretrained(save_path, max_shard_size="10GB")
diff --git a/mllm/flamingo/injecting_mpt_into_flamingo.py b/mllm/flamingo/injecting_mpt_into_flamingo.py
deleted file mode 100644
index 4684a81856d94969b25ce060db31c1597d44c630..0000000000000000000000000000000000000000
--- a/mllm/flamingo/injecting_mpt_into_flamingo.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import argparse
-import os
-
-import torch
-from tqdm import tqdm
-
-import sys
-
-from configuration_flamingo import FlamingoConfig
-from modeling_flamingo import FlamingoForConditionalGeneration
-from utils import rename_flamingo_checkpoint
-
-parser = argparse.ArgumentParser(description="Convert MPT model")
-parser.add_argument("--model_choice", type=str, choices=["7B", "30B"], required=True, help="Choose either '7B' or '30B'")
-parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected")
-args = parser.parse_args()
-
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-root_dir = args.mpt_root_dir
-model_choice = args.model_choice
-save_root_dir = args.save_root_dir
-
-# prepare the mpt model first
-# you can visit https://huggingface.co/mosaicml to download the 7B and 30B instruct checkpoints.
-if model_choice == "30B":
-    config_file = "./flamingo/flamingo-mpt-30B.json"
-    state_dict_files = [
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00001-of-00007.bin",
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00002-of-00007.bin",
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00003-of-00007.bin",
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00004-of-00007.bin",
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00005-of-00007.bin",
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00006-of-00007.bin",
-        f"{root_dir}/mpt-30b-instruct/pytorch_model-00007-of-00007.bin",
-    ]
-    save_path = f"{save_root_dir}/flamingo-mpt-30B-instruct-init"
-elif model_choice == "7B":
-    config_file = "./flamingo/flamingo-mpt-7B.json"
-    state_dict_files = [
-        f"{root_dir}/mpt-7b/pytorch_model-00001-of-00002.bin",
-        f"{root_dir}/mpt-7b/pytorch_model-00002-of-00002.bin",
-    ]
-    save_path = f"{save_root_dir}/flamingo-mpt-7B"
-else:
-    raise ValueError("Invalid model_choice. Choose either '30B' or '7B'.")
-
-config = FlamingoConfig.from_json_file(config_file)
-
-model = FlamingoForConditionalGeneration(config=config)
-
-
-# load flamingo's vision encoder from the last checkpoint shard.
-# you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
-AZP = os.environ["AZP"]
-state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
-for cur_key in list(state_dict_3.keys()):
-    if "vision_encoder" not in cur_key:
-        del state_dict_3[cur_key]
-
-load_msg = model.load_state_dict(
-    state_dict_3,
-    False,
-)
-# print incompatible keys
-print(load_msg[1])
-
-# Loading mpt weights
-state_dict = {}
-for file in tqdm(state_dict_files, desc="Loading state dict"):
-    state_dict_part = torch.load(file, map_location="cpu")
-    state_dict.update(state_dict_part)
-
-save_state_dict_1 = {}
-for key in state_dict:
-    if ".blocks." in key:
-        _, _, layer_num, *remain_names = key.split(".")
-        target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-    else:
-        target_key = key
-    save_state_dict_1[f"{target_key}"] = state_dict[key]
-
-load_msg = model.lang_encoder.load_state_dict(
-    save_state_dict_1,
-    False,
-)
-# print incompatible keys
-print(load_msg[1])
-if args.flamingo_dir is not None:
-    state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu")
-    save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2)
-
-    real_vocab_size = config.text_config.vocab_size
-    # Resize the token embeddings to the Flamingo checkpoint's embedding size before loading
-    model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0])
-
-    load_msg = model.load_state_dict(
-        save_state_dict_2,
-        False,
-    )
-    # print incompatible keys
-    print(load_msg[1])
-    # Resize the token embeddings back to the full vocab size (50432)
-    model.lang_encoder.resize_token_embeddings(real_vocab_size)
-
-print(f"Saving model to {save_path}...")
-model.save_pretrained(save_path, max_shard_size="10GB")
diff --git a/mllm/flamingo/injecting_vicuna_into_flamingo.py b/mllm/flamingo/injecting_vicuna_into_flamingo.py
deleted file mode 100644
index 95cf5f3210fbdc7df2710fdee3de392ba03fdaee..0000000000000000000000000000000000000000
--- a/mllm/flamingo/injecting_vicuna_into_flamingo.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import argparse
-import os
-
-import torch
-from tqdm import tqdm
-
-import sys
-
-from .configuration_flamingo import FlamingoConfig
-from .modeling_flamingo import FlamingoForConditionalGeneration
-
-# from .configuration_flamingo import FlamingoConfig
-# from .modeling_flamingo import FlamingoForConditionalGeneration
-
-parser = argparse.ArgumentParser(description="Convert Vicuna model")
-parser.add_argument("--model_choice", type=str, choices=["7B", "33B"], required=True, help="Choose either '7B' or '33B'")
-parser.add_argument("--vicuna_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
-parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected")
-args = parser.parse_args()
-
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-root_dir = args.vicuna_root_dir
-model_choice = args.model_choice
-save_root_dir = args.save_root_dir
-
-# prepare the vicuna model first
-# you can visit https://huggingface.co/lmsys/vicuna-33b-v1.3 to download the 7B and 33B checkpoints.
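-# The overall recipe here mirrors the other injection scripts: build an empty
-# FlamingoForConditionalGeneration from the JSON config, copy the CLIP vision tower
-# out of the OpenFlamingo-9B HF checkpoint, re-nest every decoder-layer key under
-# ".decoder_layer." to match the FlamingoLayer wrappers, and finally resize the token
-# embeddings (32000 before loading, 32002 after) so the base vocabulary plus the two
-# extra media tokens line up (presumably "<image>" and "<|endofchunk|>").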
-if model_choice == "33B":
-    config_file = "./flamingo/flamingo-vicuna-33B-v1.3.json"
-    state_dict_files = [
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00001-of-00007.bin",
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00002-of-00007.bin",
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00003-of-00007.bin",
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00004-of-00007.bin",
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00005-of-00007.bin",
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00006-of-00007.bin",
-        f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00007-of-00007.bin",
-    ]
-    save_path = f"{save_root_dir}/flamingo-vicuna-33B-v1.3-init"
-elif model_choice == "7B":
-    config_file = "./flamingo/flamingo-vicuna-7B-v1.3.json"
-    state_dict_files = [
-        f"{root_dir}/vicuna-7b-v1.3/pytorch_model-00001-of-00002.bin",
-        f"{root_dir}/vicuna-7b-v1.3/pytorch_model-00002-of-00002.bin",
-    ]
-    save_path = f"{save_root_dir}/flamingo-vicuna-7B-v1.3-init"
-else:
-    raise ValueError("Invalid model_choice. Choose either '33B' or '7B'.")
-
-config = FlamingoConfig.from_json_file(config_file)
-model = FlamingoForConditionalGeneration(config=config)
-
-# load flamingo's vision encoder from the last checkpoint shard.
-# you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
-AZP = os.environ["AZP"]
-state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
-for cur_key in list(state_dict_3.keys()):
-    if "vision_encoder" not in cur_key:
-        del state_dict_3[cur_key]
-
-load_msg = model.load_state_dict(
-    state_dict_3,
-    False,
-)
-# print incompatible keys
-print(load_msg[1])
-
-# Loading vicuna weights
-state_dict = {}
-for file in tqdm(state_dict_files, desc="Loading state dict"):
-    state_dict_part = torch.load(file, map_location="cpu")
-    state_dict.update(state_dict_part)
-
-save_state_dict_1 = {}
-for key in state_dict:
-    if ".layers." in key:
-        _, _, layer_num, *remain_names = key.split(".")
-        target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-    else:
-        target_key = key
-    save_state_dict_1[f"{target_key}"] = state_dict[key]
-
-# Resize the token embeddings to 32000 to match the base Vicuna vocabulary before loading
-model.lang_encoder.resize_token_embeddings(32000)
-
-load_msg = model.lang_encoder.load_state_dict(
-    save_state_dict_1,
-    False,
-)
-# Resize the token embeddings to 32002 to make room for the added special tokens
-model.lang_encoder.resize_token_embeddings(32002)
-# print incompatible keys
-print(load_msg[1])
-
-
-print(f"Saving model to {save_path}...")
-model.save_pretrained(save_path, max_shard_size="10GB")
diff --git a/mllm/flamingo/modeling_flamingo.py b/mllm/flamingo/modeling_flamingo.py
deleted file mode 100644
index 26f6e5284abdb2ea37b5d769e9665be247aaf8c2..0000000000000000000000000000000000000000
--- a/mllm/flamingo/modeling_flamingo.py
+++ /dev/null
@@ -1,966 +0,0 @@
-import random
-from dataclasses import dataclass
-from typing import Callable, Optional
-
-import torch
-import torch.nn as nn
-from accelerate.hooks import AlignDevicesHook, add_hook_to_module
-from einops import rearrange, repeat
-from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.auto import AutoModel, AutoModelForCausalLM, AutoTokenizer
-
-from .configuration_flamingo import FlamingoConfig
-from .falcon.modelling_RW import RWForCausalLM
-from .mpt.modeling_mpt import MPTForCausalLM
-from .mpt_redpajama.mosaic_gpt import MosaicGPT
-
-# from .configuration_flamingo import FlamingoConfig
-
-__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
-    "opt": "model.decoder.layers",
-    "gptneo": "transformer.h",
-    "gptj": "transformer.h",
-    "gpt-j": "transformer.h",
-    "pythia": "gpt_neox.layers",
-    "llama": "model.layers",
-    "RWForCausalLM": "transformer.h",
-    "MPTForCausalLM": "transformer.blocks",
-    "MosaicGPT": "transformer.blocks",
-}
-
-
-def _infer_decoder_layers_attr_name(model: nn.Module):
-    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
-        if k.lower() in model.__class__.__name__.lower():
-            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
-
-    raise ValueError(
-        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
-    )
-
-
-def extend_instance(obj, mixin):
-    """Apply mixins to a class instance after creation"""
-    base_cls = obj.__class__
-    base_cls_name = obj.__class__.__name__
-    obj.__class__ = type(base_cls_name, (mixin, base_cls), {})  # mixin needs to go first for our forward() logic to work
-
-
-def getattr_recursive(obj, att):
-    """
-    Return nested attribute of obj
-    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
-    """
-    if att == "":
-        return obj
-    i = att.find(".")
-    if i < 0:
-        return getattr(obj, att)
-    else:
-        return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
-
-
-def setattr_recursive(obj, att, val):
-    """
-    Set nested attribute of obj
-    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
-    """
-    if "." 
in att: - obj = getattr_recursive(obj, ".".join(att.split(".")[:-1])) - setattr(obj, att.split(".")[-1], val) - - -def exists(val): - return val is not None - - -class FlamingoPerceiverBlock(nn.Module): - def __init__(self, *, dim: int, dim_head: int = 64, heads: int = 8, mult: int = 4): - super().__init__() - self.scale = dim_head**-0.5 - self.heads = heads - inner_dim = dim_head * heads - ff_dim = dim * mult - self.norm_media = nn.LayerNorm(dim) - self.norm_latents = nn.LayerNorm(dim) - - self.to_q = nn.Linear(dim, inner_dim, bias=False) - self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) - self.to_out = nn.Linear(inner_dim, dim, bias=False) - self.feed_forward = nn.ModuleList( - [ - nn.LayerNorm(dim), - nn.Linear(dim, ff_dim, bias=False), - nn.GELU(), - nn.Linear(ff_dim, dim, bias=False), - ] - ) - - def forward(self, x: torch.Tensor, latents: torch.Tensor) -> torch.Tensor: - """ - Args: - x (torch.Tensor): image features - shape (b, T, n1, D) - latent (torch.Tensor): latent features - shape (b, T, n2, D) - """ - x = self.norm_media(x) - residual_latents = latents - latents = self.norm_latents(latents) - - h = self.heads - - q = self.to_q(latents) - kv_input = torch.cat((x, latents), dim=-2) - k, v = self.to_kv(kv_input).chunk(2, dim=-1) - q = rearrange(q, "b t n (h d) -> b h t n d", h=h) - k = rearrange(k, "b t n (h d) -> b h t n d", h=h) - v = rearrange(v, "b t n (h d) -> b h t n d", h=h) - q = q * self.scale - - # attention - sim = torch.einsum("... i d, ... j d -> ... i j", q, k) - sim = sim - sim.amax(dim=-1, keepdim=True).detach() - attn = sim.softmax(dim=-1) - - out = torch.einsum("... i j, ... j d -> ... i d", attn, v) - out = rearrange(out, "b h t n d -> b t n (h d)", h=h) - out = self.to_out(out) + residual_latents - residual_out = out - for layer in self.feed_forward: - out = layer(out) - return out + residual_out - - -class FlamingoPerceiverResampler(nn.Module): - def __init__( - self, - *, - dim: int, - depth: int = 6, - dim_head: int = 64, - heads: int = 8, - num_latents: int = 64, - # max_num_frames: int = 128, - max_num_media: Optional[int] = None, - max_num_frames: Optional[int] = None, - ff_mult: int = 4, - ): - super().__init__() - self.latents = nn.Parameter(torch.randn(num_latents, dim)) - self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None - # self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) - - self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None - - self.layers = nn.ModuleList([]) - for _ in range(depth): - self.layers.append(FlamingoPerceiverBlock(dim=dim, dim_head=dim_head, heads=heads, mult=ff_mult)) - - self.norm = nn.LayerNorm(dim) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Args: - x (torch.Tensor): image features - shape (b, T, F, v, D) - Returns: - shape (b, T, n, D) where n is self.num_latents - """ - b, T, F, v = x.shape[:4] - - # frame and media time embeddings - if exists(self.frame_embs): - frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v) - x = x + frame_embs - x = rearrange(x, "b T F v d -> b T (F v) d") # flatten the frame and spatial dimensions - if exists(self.media_time_embs): - x = x + self.media_time_embs[:T] - - # blocks - latents = repeat(self.latents, "n d -> b T n d", b=b, T=T) - for block in self.layers: - latents = block(x, latents) - return self.norm(latents) - - -class FlamingoMaskedCrossAttention(nn.Module): - def __init__( - self, - *, - dim: int, - dim_visual: int, 
- dim_head: int = 64, - heads: int = 8, - only_attend_immediate_media: bool = True, - ): - super().__init__() - self.scale = dim_head**-0.5 - self.heads = heads - inner_dim = dim_head * heads - - self.norm = nn.LayerNorm(dim) - - self.to_q = nn.Linear(dim, inner_dim, bias=False) - self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False) - self.to_out = nn.Linear(inner_dim, dim, bias=False) - - # whether for text to only attend to immediate preceding image, or all previous images - self.only_attend_immediate_media = only_attend_immediate_media - - def forward( - self, - x: torch.Tensor, - media: torch.Tensor, - media_locations: Optional[torch.BoolTensor] = None, - attend_previous: bool = True, - ) -> torch.Tensor: - """ - Args: - x (torch.Tensor): text features - shape (B, T_txt, D_txt) - media (torch.Tensor): image features - shape (B, T_img, n, D_img) where n is the dim of the latents - media_locations: boolean mask identifying the media tokens in x - shape (B, T_txt) - attend_previous: bool - If false, ignores immediately preceding image and starts attending when following image - """ - _, T_img, n = media.shape[:3] - h = self.heads - - x = self.norm(x) - - q = self.to_q(x) - media = rearrange(media, "b t n d -> b (t n) d") - - k, v = self.to_kv(media).chunk(2, dim=-1) - q = rearrange(q, "b n (h d) -> b h n d", h=h) - k = rearrange(k, "b n (h d) -> b h n d", h=h) - v = rearrange(v, "b n (h d) -> b h n d", h=h) - - q = q * self.scale - - sim = torch.einsum("... i d, ... j d -> ... i j", q, k) - - if exists(media_locations): - # at each boolean of True, increment the time counter (relative to media time) - text_time = media_locations.cumsum(dim=-1) - media_time = torch.arange(T_img, device=x.device) + 1 - - if not attend_previous: - text_time[~media_locations] += 1 - # make sure max is still the number of images in the sequence - text_time[ - text_time - > repeat( - torch.count_nonzero(media_locations, dim=1), - "b -> b i", - i=text_time.shape[1], - ) - ] = 0 - - # text time must equal media time if only attending to most immediate image - # otherwise, as long as text time is greater than media time (if attending to all previous images / media) - mask_op = torch.eq if self.only_attend_immediate_media else torch.ge - - text_to_media_mask = mask_op( - rearrange(text_time, "b i -> b 1 i 1"), - repeat(media_time, "j -> 1 1 1 (j n)", n=n), - ) - sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max) - - sim = sim - sim.amax(dim=-1, keepdim=True).detach() - attn = sim.softmax(dim=-1) - - if exists(media_locations) and self.only_attend_immediate_media: - # any text without a preceding media needs to have attention zeroed out - text_without_media_mask = text_time == 0 - text_without_media_mask = rearrange(text_without_media_mask, "b i -> b 1 i 1") - attn = attn.masked_fill(text_without_media_mask, 0.0) - - out = torch.einsum("... i j, ... j d -> ... 
i d", attn, v) - out = rearrange(out, "b h n d -> b n (h d)") - return self.to_out(out) - - -class FlamingoGatedCrossAttentionBlock(nn.Module): - def __init__( - self, - *, - dim: int, - dim_visual: int, - dim_head: int = 64, - heads: int = 8, - ff_mult: int = 4, - only_attend_immediate_media: bool = True, - ): - super().__init__() - self.attn = FlamingoMaskedCrossAttention( - dim=dim, - dim_visual=dim_visual, - dim_head=dim_head, - heads=heads, - only_attend_immediate_media=only_attend_immediate_media, - ) - self.attn_gate = nn.Parameter(torch.tensor([0.0])) - self.feed_forward = nn.ModuleList( - [ - nn.LayerNorm(dim), - nn.Linear(dim, dim * ff_mult, bias=False), - nn.GELU(), - nn.Linear(dim * ff_mult, dim, bias=False), - ] - ) - self.ff_gate = nn.Parameter(torch.tensor([0.0])) - - def forward( - self, - x: torch.Tensor, - media: torch.Tensor, - media_locations: Optional[torch.BoolTensor] = None, - attend_previous: bool = True, - ) -> torch.Tensor: - x = ( - self.attn( - x, - media, - media_locations=media_locations, - attend_previous=attend_previous, - ) - * self.attn_gate.tanh() - + x - ) - residual_x = x - for ff in self.feed_forward: - x = ff(x) - x = x * self.ff_gate.tanh() + residual_x - - return x - - -class FlamingoLayer(nn.Module): - def __init__(self, gated_cross_attn_layer: nn.Module, decoder_layer: nn.Module): - super().__init__() - self.gated_cross_attn_layer = gated_cross_attn_layer - self.decoder_layer = decoder_layer - self.vis_x = None - self.media_locations = None - - def is_conditioned(self) -> bool: - """Check whether the layer is conditioned.""" - return self.vis_x is not None - - # Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/) - def condition_vis_x(self, vis_x) -> None: - self.vis_x = vis_x - - def condition_media_locations(self, media_locations) -> None: - self.media_locations = media_locations - - def condition_attend_previous(self, attend_previous) -> None: - self.attend_previous = attend_previous - - def forward( - self, - lang_x: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - **decoder_layer_kwargs, - ): - if self.gated_cross_attn_layer is None: - return self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs) - - if self.vis_x is None: - raise ValueError("vis_x must be conditioned before forward pass") - - if self.media_locations is None: - raise ValueError("media_locations must be conditioned before forward pass") - - lang_x = self.gated_cross_attn_layer( - lang_x, - self.vis_x, - media_locations=self.media_locations, - attend_previous=self.attend_previous, - ) - lang_x = self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs) - return lang_x - - -class FlamingoLMMixin(nn.Module): - """ - Mixin to add cross-attention layers to a language model. - """ - - def set_decoder_layers_attr_name(self, decoder_layers_attr_name): - self.decoder_layers_attr_name = decoder_layers_attr_name - - def _get_decoder_layers(self): - return getattr_recursive(self, self.decoder_layers_attr_name) - - def _set_decoder_layers(self, value): - setattr_recursive(self, self.decoder_layers_attr_name, value) - - def init_flamingo( - self, - media_token_id: int, - vis_hidden_size: int, - cross_attn_every_n_layers: int, - use_media_placement_augmentation: bool, - ): - """ - Initialize Flamingo by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations. 
- """ - - gated_cross_attn_layers = nn.ModuleList( - [ - FlamingoGatedCrossAttentionBlock( - dim=self.config.hidden_size, - dim_visual=vis_hidden_size, - ) - if (layer_idx + 1) % cross_attn_every_n_layers == 0 - else None - for layer_idx, _ in enumerate(self._get_decoder_layers()) - ] - ) - self._set_decoder_layers( - nn.ModuleList( - [ - FlamingoLayer(gated_cross_attn_layer, decoder_layer) - for gated_cross_attn_layer, decoder_layer in zip(gated_cross_attn_layers, self._get_decoder_layers()) - ] - ) - ) - self.media_token_id = media_token_id - self.use_media_placement_augmentation = use_media_placement_augmentation - self.initialized_flamingo = True - - def forward(self, *input, **kwargs): - """Condition the Flamingo layers on the media locations before forward()""" - if not self.initialized_flamingo: - raise ValueError("Flamingo layers are not initialized. Please call `init_flamingo` first.") - - input_ids = kwargs["input_ids"] if "input_ids" in kwargs else input[0] - media_locations = input_ids == self.media_token_id - # IMPORTANT: Force `attend_previous` to True when we place training data as caption<|endofchunk|> - # attend_previous = ( - # (random.random() < 0.5) if self.use_media_placement_augmentation else False - # ) - attend_previous = (random.random() < 0.5) if self.use_media_placement_augmentation else True - # attend_previous = self.only_attend_previous - - if self.__class__.__name__ == "LlamaForCausalLM": - for layer in self.get_decoder().layers: - layer.condition_media_locations(media_locations) - layer.condition_attend_previous(attend_previous) - elif self.__class__.__name__ in ["MPTForCausalLM", "MosaicGPT"]: - for layer in self.get_decoder().blocks: - layer.condition_media_locations(media_locations) - layer.condition_attend_previous(attend_previous) - else: - print("inavaliable text encoder") - return super().forward(*input, **kwargs) # Call the other parent's forward method - - def is_conditioned(self) -> bool: - """Check whether all decoder layers are already conditioned.""" - return all(l.is_conditioned() for l in self._get_decoder_layers()) - - def clear_conditioned_layers(self) -> None: - for layer in self._get_decoder_layers(): - layer.condition_vis_x(None) - layer.condition_media_locations(None) - layer.condition_attend_previous(None) - - -class FlamingoPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = FlamingoConfig - base_model_prefix = "flamingo" - supports_gradient_checkpointing = True - _no_split_modules = ["FlamingoPerceiverBlock", "CLIPEncoderLayer", "FlamingoLayer"] - - def _init_weights(self, module): - """Flamingo requires no specific initialization""" - return super()._init_weights(module) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, FlamingoModel): - module.gradient_checkpointing = value - - -class FlamingoModel(FlamingoPreTrainedModel): - config_class = FlamingoConfig - - def __init__( - self, - config: FlamingoConfig, - ): - super().__init__(config) - ### TODO: give "LlamaForCausalLM" as the name of text_config.architectures of Llama_based flamingo - if "llama" not in config.text_config._name_or_path: - if config.text_config.architectures[0] == "MPTForCausalLM": - text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct") - lang_encoder = MPTForCausalLM(config=config.text_config) - elif config.text_config.text_config.architectures[0] == "MosaicGPT": - text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mosaic-llama-redpajama-final-candidate") - lang_encoder = MosaicGPT(config=config.text_config) - elif config.text_config.architectures[0] == "RWForCausalLM": - text_tokenizer = AutoTokenizer.from_pretrained("PATH-TO-YOUR-FALCON") - lang_encoder = RWForCausalLM(config=config.text_config) - else: - text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path) - lang_encoder = LlamaForCausalLM(config=config.text_config) - - vision_encoder = CLIPVisionModel(config=config.vision_config) - text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", ""]}) - if text_tokenizer.pad_token is None: - text_tokenizer.add_special_tokens({"pad_token": ""}) - self.text_tokenizer = text_tokenizer - self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1] - self.media_token_id = text_tokenizer.encode("")[-1] - - extend_instance(lang_encoder, FlamingoLMMixin) - decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder) - lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name) - if lang_encoder.__class__.__name__ == "LlamaForCausalLM": - lang_encoder.resize_token_embeddings(len(text_tokenizer)) - self.lang_encoder = lang_encoder - - self.cross_attn_every_n_layers = config.cross_attn_every_n_layers if hasattr(config, "cross_attn_every_n_layers") else 4 - self.use_media_placement_augmentation = config.use_media_placement_augmentation - - vision_encoder.output_tokens = True - self.vision_encoder = vision_encoder - - self.vis_dim = 1024 - self.perceiver = FlamingoPerceiverResampler(dim=self.vis_dim) - - self.lang_encoder.init_flamingo( - media_token_id=self.media_token_id, - vis_hidden_size=self.vis_dim, - cross_attn_every_n_layers=self.cross_attn_every_n_layers, - use_media_placement_augmentation=self.use_media_placement_augmentation, - ) - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.lang_encoder.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.lang_encoder.set_input_embeddings(new_embeddings) - - def get_output_embeddings(self) -> nn.Module: - return self.lang_encoder.get_output_embeddings() - - def set_output_embeddings(self, new_embeddings): - self.lang_encoder.set_output_embeddings(new_embeddings) - - def get_image_encoder(self) -> nn.Module: - return self.vision_encoder - - def get_lang_encoder(self) -> nn.Module: - return self.lang_encoder - - # def 
-
-    def init_weights(self):
-        # Freeze all parameters in the vision encoder
-        for param in self.vision_encoder.parameters():
-            param.requires_grad = False
-
-        if "lora_config" in self.config.__dict__:
-            print(f"LoRA trainable param: {(sum(p.numel() for p in self.lang_encoder.parameters() if p.requires_grad)) / 1e9:.3f} B")
-            # Unfreeze gated_cross_attn_layers
-            for layer in self.lang_encoder._get_decoder_layers():
-                if layer.gated_cross_attn_layer is not None:
-                    for param in layer.gated_cross_attn_layer.parameters():
-                        param.requires_grad = True
-        else:
-            # Freeze all parameters in the lang encoder except gated_cross_attn_layers
-            for name, param in self.lang_encoder.named_parameters():
-                if "gated_cross_attn_layer" not in name:
-                    param.requires_grad = False
-            # Unfreeze LM input and output embeddings
-            self.lang_encoder.get_input_embeddings().requires_grad_(True)
-            ## MPTForCausalLM uses tied word embeddings
-            if self.lang_encoder.__class__.__name__ == "LlamaForCausalLM":
-                self.lang_encoder.lm_head.requires_grad_(True)
-        # Print model size in billions of parameters to 3 decimal places
-        print(f"Total Trainable param: {(sum(p.numel() for p in self.parameters() if p.requires_grad)) / 1e9:.3f} B")
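# Editor's sketch of the shape contract for the forward pass below; the sizes
# are illustrative and `model` is assumed to be an initialized FlamingoModel.
import torch

B, T_img, F, C, H, W = 2, 1, 1, 3, 224, 224   # one 224x224 image per sample
vision_x = torch.randn(B, T_img, F, C, H, W)
lang_x = torch.randint(0, 32000, (B, 16))     # (B, T_txt) token ids
# output = model(vision_x=vision_x, lang_x=lang_x, labels=lang_x)
# output.loss / output.logits follow the usual CausalLMOutputWithPast layout.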
-
-    def forward(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cached_vision_x: bool = False,
-        clear_conditioned_layers: bool = True,
-        past_key_values: Optional[torch.Tensor] = None,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> CausalLMOutputWithPast:
-        """
-        Forward pass of Flamingo.
-
-        Args:
-            vision_x (torch.Tensor): Vision input, shape (B, T_img, F, C, H, W) with F=1
-            lang_x (torch.Tensor): Language input ids, shape (B, T_txt)
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-            labels (torch.Tensor, optional): Labels. Defaults to None.
-            clear_conditioned_layers: if True, clear the conditioned layers
-                once the forward pass is completed. Set this to False if the
-                same set of images will be reused in another subsequent
-                forward pass.
-            past_key_values: pre-computed values to pass to the language model.
-                See past_key_values documentation in Hugging Face
-                CausalLM models.
-            use_cache: whether to use cached key values. See use_cache
-                documentation in Hugging Face CausalLM models.
-        """
-        assert (vision_x is not None) or use_cached_vision_x, "Must provide either vision_x or set use_cached_vision_x to True."
-
-        if use_cached_vision_x:
-            # Case: use cached vision features; vision_x should be None and the
-            # layers should already be conditioned.
-            assert vision_x is None, "Expect vision_x to be None when use_cached_vision_x is True."
-            assert self.lang_encoder.is_conditioned()
-        else:
-            # Case: do not use caching (i.e. this is a standard forward pass)
-            self._encode_vision_x(vision_x=vision_x)
-
-        output = self.lang_encoder(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            labels=labels,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            **kwargs,
-        )
-
-        if clear_conditioned_layers:
-            self.lang_encoder.clear_conditioned_layers()
-
-        return output
-
-    def _encode_vision_x(self, vision_x: torch.Tensor):
-        """
-        Compute media tokens from the vision input by passing it through the vision encoder and conditioning the language model.
-        Args:
-            vision_x (torch.Tensor): Vision input, shape (B, T_img, F, C, H, W)
-                Images in the same chunk are collated along T_img, and frames are collated along F
-                Currently only F=1 is supported (single-frame videos)
-
-        rearrange code based on https://github.com/dhansmair/flamingo-mini
-        """
-        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-        b, T, F = vision_x.shape[:3]
-        assert F == 1, "Only single frame supported"
-
-        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-        with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)[0][:, 1:, :]
-        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-
-        vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
-
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_vis_x(vision_x)
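# Shape walk for _encode_vision_x above (editor's illustration). With CLIP
# ViT-L/14 at 224px the encoder returns 257 tokens of width 1024; dropping the
# CLS token leaves v = 256, and the perceiver compresses those to 64 latents.
import torch
from einops import rearrange

b, T, F, v, d = 2, 1, 1, 256, 1024
patch_tokens = torch.randn(b * T * F, v, d)   # vision_encoder output, CLS removed
patch_tokens = rearrange(patch_tokens, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
assert patch_tokens.shape == (b, T, F, v, d)  # the perceiver then yields (b, T, 64, d)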
-
-
-class FlamingoForConditionalGeneration(FlamingoPreTrainedModel):
-    config_class = FlamingoConfig
-
-    def __init__(
-        self,
-        config: FlamingoConfig,
-    ):
-        super().__init__(config)
-        # TODO: hardcoded for now because the AutoXXX classes are too slow
-        # vision_encoder = AutoModel.from_config(config.vision_config).vision_model
-        # lang_encoder = AutoModelForCausalLM.from_config(config.text_config)
-        # text_tokenizer = AutoTokenizer.from_pretrained(config.text_config._name_or_path)
-
-        ### TODO: give "LlamaForCausalLM" as the name of text_config.architectures of Llama-based flamingo
-        if config.text_config.architectures[0] == "MPTForCausalLM":
-            text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
-            lang_encoder = MPTForCausalLM(config=config.text_config)
-        elif config.text_config.architectures[0] == "MosaicGPT":
-            text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mosaic-llama-redpajama-final-candidate")
-            lang_encoder = MosaicGPT(config=config.text_config)
-        elif config.text_config.architectures[0] == "RWForCausalLM":
-            text_tokenizer = AutoTokenizer.from_pretrained("PATH-TO-YOUR-FALCON")
-            lang_encoder = RWForCausalLM(config=config.text_config)
-        elif config.text_config.architectures[0] == "LlamaForCausalLM":
-            text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path)
-            lang_encoder = LlamaForCausalLM(config=config.text_config)
-        else:
-            raise ValueError(f"Unsupported text architecture: {config.text_config.architectures[0]}")
-
-        vision_encoder = CLIPVisionModel(config=config.vision_config)
-        text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
-        if text_tokenizer.pad_token is None:
-            text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-        self.text_tokenizer = text_tokenizer
-        self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1]
-        self.media_token_id = text_tokenizer.encode("<image>")[-1]
-
-        extend_instance(lang_encoder, FlamingoLMMixin)
-        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
-        lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
-        if "LlamaForCausalLM" in lang_encoder.__class__.__name__:
-            lang_encoder.resize_token_embeddings(len(text_tokenizer))
-        self.lang_encoder = lang_encoder
-
-        self.cross_attn_every_n_layers = config.cross_attn_every_n_layers if hasattr(config, "cross_attn_every_n_layers") else 4
-        self.use_media_placement_augmentation = config.use_media_placement_augmentation
-
-        vision_encoder.output_tokens = True
-        self.vision_encoder = vision_encoder
-
-        self.vis_dim = 1024
-        self.perceiver = FlamingoPerceiverResampler(dim=self.vis_dim)
-
-        self.lang_encoder.init_flamingo(
-            media_token_id=self.media_token_id,
-            vis_hidden_size=self.vis_dim,
-            cross_attn_every_n_layers=self.cross_attn_every_n_layers,
-            use_media_placement_augmentation=self.use_media_placement_augmentation,
-        )
-        self.post_init()
-
-    def get_input_embeddings(self) -> nn.Module:
-        return self.lang_encoder.get_input_embeddings()
-
-    def set_input_embeddings(self, new_embeddings):
-        self.lang_encoder.set_input_embeddings(new_embeddings)
-
-    def get_output_embeddings(self) -> nn.Module:
-        return self.lang_encoder.get_output_embeddings()
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lang_encoder.set_output_embeddings(new_embeddings)
-
-    def get_image_encoder(self) -> nn.Module:
-        return self.vision_encoder
-
-    def get_lang_encoder(self) -> nn.Module:
-        return self.lang_encoder
-
-    def init_weights(self):
-        # Freeze all parameters in the vision encoder
-        for param in self.vision_encoder.parameters():
-            param.requires_grad = False
-        # Freeze all parameters in the lang encoder except gated_cross_attn_layers
-        for name, param in self.lang_encoder.named_parameters():
-            if "gated_cross_attn_layer" not in name:
-                param.requires_grad = False
-        # Unfreeze LM input embeddings
-        self.lang_encoder.get_input_embeddings().requires_grad_(True)
-        ## MPTForCausalLM uses tied word embeddings
-        if "LlamaForCausalLM" in self.lang_encoder.__class__.__name__:
-            self.lang_encoder.lm_head.requires_grad_(True)
-        # Print every trainable parameter and the total size in billions
-        print("====================Model Grad Part====================")
-        total_params = 0
-        for name, param in self.named_parameters():
-            if param.requires_grad:
-                total_params += param.numel()
-                print(f"Parameter: {name}, Size: {param.numel() / 1e6:.6f} M")
-        print(f"Total Trainable param: {total_params / 1e9:.4f} B")
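# A small helper equivalent to the accounting done in init_weights above
# (editor's sketch): useful for checking that only the gated cross-attention
# blocks, the perceiver, and the embeddings remain trainable.
def trainable_params_in_billions(module) -> float:
    return sum(p.numel() for p in module.parameters() if p.requires_grad) / 1e9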
1e9:.3f} B") - - def forward( - self, - vision_x: torch.Tensor, - lang_x: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cached_vision_x: bool = False, - clear_conditioned_layers: bool = True, - past_key_values: Optional[torch.Tensor] = None, - use_cache: bool = False, - **kwargs, - ) -> CausalLMOutputWithPast: - """ - Forward pass of Flamingo. - - Args: - vision_x (torch.Tensor): Vision input - shape (B, T_img, F, C, H, W) with F=1 - lang_x (torch.Tensor): Language input ids - shape (B, T_txt) - attention_mask (torch.Tensor, optional): Attention mask. Defaults to None. - labels (torch.Tensor, optional): Labels. Defaults to None. - clear_conditioned_layers: if True, clear the conditioned layers - once the foward pass is completed. Set this to false if the - same set of images will be reused in another subsequent - forward pass. - past_key_values: pre-computed values to pass to language model. - See past_key_values documentation in Hugging Face - CausalLM models. - use_cache: whether to use cached key values. See use_cache - documentation in Hugging Face CausalLM models. - """ - assert (vision_x is not None) or use_cached_vision_x, "Must provide either vision_x or use_cached_vision_x to True." - - if use_cached_vision_x: - # Case: use cached; vision_x should be cached and other - # vision-related inputs should not be provided. - assert vision_x is None, "Expect vision_x to be None when use_cached_vision_x is True." - assert self.lang_encoder.is_conditioned() - - else: - # Case: do not use caching (i.e. this is a standard forward pass); - self._encode_vision_x(vision_x=vision_x) - - output = self.lang_encoder( - input_ids=lang_x, - attention_mask=attention_mask, - labels=labels, - past_key_values=past_key_values, - use_cache=use_cache, - **kwargs, - ) - - if clear_conditioned_layers: - self.lang_encoder.clear_conditioned_layers() - - return output - - def _encode_vision_x(self, vision_x: torch.Tensor): - """ - Compute media tokens from vision input by passing it through vision encoder and conditioning language model. 
-
-    def _encode_vision_x(self, vision_x: torch.Tensor):
-        """
-        Compute media tokens from the vision input by passing it through the vision encoder and conditioning the language model.
-        Args:
-            vision_x (torch.Tensor): Vision input, shape (B, T_img, F, C, H, W)
-                Images in the same chunk are collated along T_img, and frames are collated along F
-                (unlike FlamingoModel._encode_vision_x, the F=1 restriction is relaxed here)
-
-        rearrange code based on https://github.com/dhansmair/flamingo-mini
-        """
-        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-        b, T, F = vision_x.shape[:3]
-        # assert F == 1, "Only single frame supported"
-
-        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-        with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)[0][:, 1:, :]
-        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-
-        vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
-
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_vis_x(vision_x)
-
-    @torch.no_grad()
-    def generate(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        num_beams: int = 1,
-        max_new_tokens: Optional[int] = None,
-        temperature: float = 1.0,
-        top_k: int = 0,
-        top_p: float = 1.0,
-        no_repeat_ngram_size: int = 0,
-        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
-        length_penalty: float = 1.0,
-        num_return_sequences: int = 1,
-        do_sample: bool = False,
-        early_stopping: bool = False,
-        **kwargs,
-    ):
-        """
-        Generate text conditioned on vision and language inputs.
-
-        Args:
-            vision_x (torch.Tensor): Vision input, shape (B, T_img, F, C, H, W)
-                images in the same chunk are collated along T_img, and frames are collated along F
-                currently only F=1 is supported (single-frame videos)
-            lang_x (torch.Tensor): Language input, shape (B, T_txt)
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-            num_beams (int, optional): Number of beams. Defaults to 1.
-            max_new_tokens (int, optional): Maximum number of new tokens. Defaults to None.
-            temperature (float, optional): Sampling temperature. Defaults to 1.0.
-            top_k (int, optional): Top-k sampling. Defaults to 0.
-            top_p (float, optional): Top-p sampling. Defaults to 1.0.
-            no_repeat_ngram_size (int, optional): No-repeat n-gram size. Defaults to 0.
-            prefix_allowed_tokens_fn (Callable, optional): Constrained-decoding hook passed through to generate. Defaults to None.
-            length_penalty (float, optional): Length penalty. Defaults to 1.0.
-            num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
-            do_sample (bool, optional): Whether to sample. Defaults to False.
-            early_stopping (bool, optional): Whether to stop early. Defaults to False.
- Returns: - torch.Tensor: lang_x with generated tokens appended to it - """ - if hasattr(self, "_hf_hook"): - # add a hook to make sure that the output of lang_encoder is mapped to the same device as the lang_x - hook = AlignDevicesHook( - execution_device=lang_x.device, - io_same_device=True, - place_submodules=False, - ) - add_hook_to_module(self.lang_encoder, hook) - if num_beams > 1: - vision_x = vision_x.repeat_interleave(num_beams, dim=0) - self._encode_vision_x(vision_x=vision_x) - output = self.lang_encoder.generate( - lang_x, - attention_mask=attention_mask, - eos_token_id=self.eoc_token_id, - num_beams=num_beams, - max_new_tokens=max_new_tokens, - temperature=temperature, - top_k=top_k, - top_p=top_p, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - no_repeat_ngram_size=no_repeat_ngram_size, - length_penalty=length_penalty, - num_return_sequences=num_return_sequences, - do_sample=do_sample, - early_stopping=early_stopping, - **kwargs, - ) - - self.lang_encoder.clear_conditioned_layers() - return output diff --git a/mllm/flamingo/mpt/__init__.py b/mllm/flamingo/mpt/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mllm/flamingo/mpt/__pycache__/__init__.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index c299b1d5f93001ee2a5f4d024770b5395b5a29e1..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt/__pycache__/attention.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/attention.cpython-39.pyc deleted file mode 100644 index 65a3344120dbc51157c39028d56b6a4a5914d9d9..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt/__pycache__/attention.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt/__pycache__/blocks.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/blocks.cpython-39.pyc deleted file mode 100644 index f3e94661df5d3686611e95f997e5ba34b5313828..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt/__pycache__/blocks.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt/__pycache__/configuration_mpt.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/configuration_mpt.cpython-39.pyc deleted file mode 100644 index bda0ded2f7ac61551c381a148ad52b02ff81fa27..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt/__pycache__/configuration_mpt.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt/__pycache__/custom_embedding.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/custom_embedding.cpython-39.pyc deleted file mode 100644 index 10495b174ad3e7addff16398f1446c1898a9c461..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt/__pycache__/custom_embedding.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt/__pycache__/flash_attn_triton.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/flash_attn_triton.cpython-39.pyc deleted file mode 100644 index 184c104b963890a3358b95198221d8921e825cd0..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt/__pycache__/flash_attn_triton.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt/__pycache__/modeling_mpt.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/modeling_mpt.cpython-39.pyc deleted file mode 100644 index 5970977120131734d1c5158c905794c7c13c398c..0000000000000000000000000000000000000000 Binary files 
a/mllm/flamingo/mpt/__pycache__/modeling_mpt.cpython-39.pyc and /dev/null differ
diff --git a/mllm/flamingo/mpt/__pycache__/norm.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/norm.cpython-39.pyc
deleted file mode 100644
index c21ee143fbfe11d67892068245c4d08476f3ed23..0000000000000000000000000000000000000000
Binary files a/mllm/flamingo/mpt/__pycache__/norm.cpython-39.pyc and /dev/null differ
diff --git a/mllm/flamingo/mpt/__pycache__/param_init_fns.cpython-39.pyc b/mllm/flamingo/mpt/__pycache__/param_init_fns.cpython-39.pyc
deleted file mode 100644
index 190195175f09bdacf28e5985127248c834656c98..0000000000000000000000000000000000000000
Binary files a/mllm/flamingo/mpt/__pycache__/param_init_fns.cpython-39.pyc and /dev/null differ
diff --git a/mllm/flamingo/mpt/adapt_tokenizer.py b/mllm/flamingo/mpt/adapt_tokenizer.py
deleted file mode 100644
index ea56c899d61fef932f1f1a42638cec0c046f3d9a..0000000000000000000000000000000000000000
--- a/mllm/flamingo/mpt/adapt_tokenizer.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from typing import Union
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-
-Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
-NUM_SENTINEL_TOKENS: int = 100
-
-
-def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
-    """Adds sentinel tokens and padding token (if missing).
-
-    Expands the tokenizer vocabulary to include sentinel tokens
-    used in mixture-of-denoiser tasks as well as a padding token.
-
-    All added tokens are added as special tokens. No tokens are
-    added if sentinel tokens and padding token already exist.
-    """
-    sentinels_to_add = [f"<extra_id_{i}>" for i in range(NUM_SENTINEL_TOKENS)]
-    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
-    if tokenizer.pad_token is None:
-        tokenizer.add_tokens("<pad>", special_tokens=True)
-        tokenizer.pad_token = "<pad>"
-        assert tokenizer.pad_token_id is not None
-    sentinels = "".join([f"<extra_id_{i}>" for i in range(NUM_SENTINEL_TOKENS)])
-    _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
-    tokenizer.sentinel_token_ids = _sentinel_token_ids
-
-
-class AutoTokenizerForMOD(AutoTokenizer):
-    """AutoTokenizer + Adaptation for MOD.
-
-    A simple wrapper around AutoTokenizer to make instantiating
-    an MOD-adapted tokenizer a bit easier.
-
-    MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
-    a padding token, and a property to get the token ids of the
-    sentinel tokens.
- """ - - @classmethod - def from_pretrained(cls, *args, **kwargs): - """See `AutoTokenizer.from_pretrained` docstring.""" - tokenizer = super().from_pretrained(*args, **kwargs) - adapt_tokenizer_for_denoising(tokenizer) - return tokenizer diff --git a/mllm/flamingo/mpt/attention.py b/mllm/flamingo/mpt/attention.py deleted file mode 100644 index 61fcfdeea29eb46189dd29bb20fa3ec52388668f..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/attention.py +++ /dev/null @@ -1,450 +0,0 @@ -"""Attention layers.""" -import math -import warnings -from typing import Optional -import torch -import torch.nn as nn -from einops import rearrange -from packaging import version -from torch import nn -from .norm import LPLayerNorm - - -def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool): - if original_is_causal and num_query_tokens != num_key_tokens: - if num_query_tokens != 1: - raise NotImplementedError("MPT does not support query and key with different number of tokens, unless number of query tokens is 1.") - else: - return False - return original_is_causal - - -def scaled_multihead_dot_product_attention( - query, - key, - value, - n_heads, - past_key_value=None, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, - multiquery=False, -): - q = rearrange(query, "b s (h d) -> b h s d", h=n_heads) - kv_n_heads = 1 if multiquery else n_heads - k = rearrange(key, "b s (h d) -> b h d s", h=kv_n_heads) - v = rearrange(value, "b s (h d) -> b h s d", h=kv_n_heads) - if past_key_value is not None: - if len(past_key_value) != 0: - k = torch.cat([past_key_value[0], k], dim=3) - v = torch.cat([past_key_value[1], v], dim=2) - past_key_value = (k, v) - (b, _, s_q, d) = q.shape - s_k = k.size(-1) - if softmax_scale is None: - softmax_scale = 1 / math.sqrt(d) - attn_weight = q.matmul(k) * softmax_scale - if attn_bias is not None: - _s_q = max(0, attn_bias.size(2) - s_q) - _s_k = max(0, attn_bias.size(3) - s_k) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q): - raise RuntimeError(f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.") - attn_weight = attn_weight + attn_bias - min_val = torch.finfo(q.dtype).min - if key_padding_mask is not None: - if attn_bias is not None: - warnings.warn( - "Propogating key_padding_mask to the attention module " - + "and applying it within the attention module can cause " - + "unneccessary computation/memory usage. Consider integrating " - + "into attn_bias once and passing that to each attention " - + "module instead." 
- ) - attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val) - if is_causal and (not q.size(2) == 1): - s = max(s_q, s_k) - causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16) - causal_mask = causal_mask.tril() - causal_mask = causal_mask.to(torch.bool) - causal_mask = ~causal_mask - causal_mask = causal_mask[-s_q:, -s_k:] - attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val) - attn_weight = torch.softmax(attn_weight, dim=-1) - if dropout_p: - attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True) - out = attn_weight.to(v.dtype).matmul(v) - out = rearrange(out, "b h s d -> b s (h d)") - if needs_weights: - return (out, attn_weight, past_key_value) - return (out, None, past_key_value) - - -def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]): - for tensor in tensors: - if tensor.dtype not in valid_dtypes: - raise TypeError(f"tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.") - if not tensor.is_cuda: - raise TypeError(f"Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).") - - -def flash_attn_fn( - query, - key, - value, - n_heads, - past_key_value=None, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, - multiquery=False, -): - try: - from flash_attn import bert_padding, flash_attn_interface - except: - raise RuntimeError("Please install flash-attn==1.0.3.post0") - check_valid_inputs(query, key, value) - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - past_key_value = (key, value) - if attn_bias is not None: - _s_q = max(0, attn_bias.size(2) - query.size(1)) - _s_k = max(0, attn_bias.size(3) - key.size(1)) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if attn_bias is not None: - raise NotImplementedError(f"attn_bias not implemented for flash attn.") - (batch_size, seqlen) = query.shape[:2] - if key_padding_mask is None: - key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) - query_padding_mask = key_padding_mask[:, -query.size(1) :] - (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask) - query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads) - (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask) - key_unpad = rearrange(key_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads) - (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask) - value_unpad = rearrange(value_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads) - if multiquery: - key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1)) - value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1)) - dropout_p = dropout_p if training else 0.0 - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - output_unpad = flash_attn_interface.flash_attn_unpadded_func( - query_unpad, - key_unpad, - value_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=softmax_scale, - causal=reset_is_causal, - return_attn_probs=needs_weights, - ) - output = bert_padding.pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen) - return (output, None, past_key_value) - - 
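# Rough intuition for the unpad/pad round trip in flash_attn_fn above (editor's
# sketch): bert_padding.unpad_input packs the unmasked tokens of a padded batch
# into one flat (total_tokens, ...) tensor plus cumulative-length offsets, and
# pad_input scatters the attention output back afterwards.
import torch

key_padding_mask = torch.tensor([[True, True, False],
                                 [True, True, True]])
seqlens = key_padding_mask.sum(dim=-1)                               # tensor([2, 3])
cu_seqlens = torch.nn.functional.pad(seqlens.cumsum(dim=0), (1, 0))  # tensor([0, 2, 5])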
-def triton_flash_attn_fn( - query, - key, - value, - n_heads, - past_key_value=None, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, - multiquery=False, -): - try: - from .flash_attn_triton import flash_attn_func - except: - _installed = False - if version.parse(torch.__version__) < version.parse("2.0.0"): - _installed = True - try: - from flash_attn.flash_attn_triton import flash_attn_func - except: - _installed = False - if not _installed: - raise RuntimeError( - "Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed." - ) - check_valid_inputs(query, key, value) - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - past_key_value = (key, value) - if attn_bias is not None: - _s_q = max(0, attn_bias.size(2) - query.size(1)) - _s_k = max(0, attn_bias.size(3) - key.size(1)) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if dropout_p: - raise NotImplementedError(f"Dropout not implemented for attn_impl: triton.") - if needs_weights: - raise NotImplementedError(f"attn_impl: triton cannot return attn weights.") - if key_padding_mask is not None: - warnings.warn( - "Propagating key_padding_mask to the attention module " - + "and applying it within the attention module can cause " - + "unnecessary computation/memory usage. Consider integrating " - + "into attn_bias once and passing that to each attention " - + "module instead." - ) - (b_size, s_k) = key_padding_mask.shape[:2] - if attn_bias is None: - attn_bias = query.new_zeros(b_size, 1, 1, s_k) - attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min) - query = rearrange(query, "b s (h d) -> b s h d", h=n_heads) - key = rearrange(key, "b s (h d) -> b s h d", h=1 if multiquery else n_heads) - value = rearrange(value, "b s (h d) -> b s h d", h=1 if multiquery else n_heads) - if multiquery: - key = key.expand(*key.shape[:2], n_heads, key.size(-1)) - value = value.expand(*value.shape[:2], n_heads, value.size(-1)) - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale) - output = attn_output.view(*attn_output.shape[:2], -1) - return (output, None, past_key_value) - - -class MultiheadAttention(nn.Module): - """Multi-head self attention. - - Using torch or triton attention implemetation enables user to also use - additive bias. 
- """ - - def __init__( - self, - d_model: int, - n_heads: int, - attn_impl: str = "triton", - clip_qkv: Optional[float] = None, - qk_ln: bool = False, - softmax_scale: Optional[float] = None, - attn_pdrop: float = 0.0, - low_precision_layernorm: bool = False, - verbose: int = 0, - device: Optional[str] = None, - ): - super().__init__() - self.attn_impl = attn_impl - self.clip_qkv = clip_qkv - self.qk_ln = qk_ln - self.d_model = d_model - self.n_heads = n_heads - self.softmax_scale = softmax_scale - if self.softmax_scale is None: - self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads) - self.attn_dropout_p = attn_pdrop - self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device) - fuse_splits = (d_model, 2 * d_model) - self.Wqkv._fused = (0, fuse_splits) - if self.qk_ln: - layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm - self.q_ln = layernorm_class(self.d_model, device=device) - self.k_ln = layernorm_class(self.d_model, device=device) - if self.attn_impl == "flash": - self.attn_fn = flash_attn_fn - elif self.attn_impl == "triton": - self.attn_fn = triton_flash_attn_fn - if verbose: - warnings.warn( - "While `attn_impl: triton` can be faster than `attn_impl: flash` " - + "it uses more memory. When training larger models this can trigger " - + "alloc retries which hurts performance. If encountered, we recommend " - + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`." - ) - elif self.attn_impl == "torch": - self.attn_fn = scaled_multihead_dot_product_attention - if torch.cuda.is_available() and verbose: - warnings.warn( - "Using `attn_impl: torch`. If your model does not use `alibi` or " - + "`prefix_lm` we recommend using `attn_impl: flash` otherwise " - + "we recommend using `attn_impl: triton`." - ) - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - self.out_proj = nn.Linear(self.d_model, self.d_model, device=device) - self.out_proj._is_residual = True - - def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False): - qkv = self.Wqkv(x) - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - (query, key, value) = qkv.chunk(3, dim=2) - key_padding_mask = attention_mask - if self.qk_ln: - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - (context, attn_weights, past_key_value) = self.attn_fn( - query, - key, - value, - self.n_heads, - past_key_value=past_key_value, - softmax_scale=self.softmax_scale, - attn_bias=attn_bias, - key_padding_mask=key_padding_mask, - is_causal=is_causal, - dropout_p=self.attn_dropout_p, - training=self.training, - needs_weights=needs_weights, - ) - return (self.out_proj(context), attn_weights, past_key_value) - - -class MultiQueryAttention(nn.Module): - """Multi-Query self attention. - - Using torch or triton attention implemetation enables user to also use - additive bias. 
- """ - - def __init__( - self, - d_model: int, - n_heads: int, - attn_impl: str = "triton", - clip_qkv: Optional[float] = None, - qk_ln: bool = False, - softmax_scale: Optional[float] = None, - attn_pdrop: float = 0.0, - low_precision_layernorm: bool = False, - verbose: int = 0, - device: Optional[str] = None, - ): - super().__init__() - self.attn_impl = attn_impl - self.clip_qkv = clip_qkv - self.qk_ln = qk_ln - self.d_model = d_model - self.n_heads = n_heads - self.head_dim = d_model // n_heads - self.softmax_scale = softmax_scale - if self.softmax_scale is None: - self.softmax_scale = 1 / math.sqrt(self.head_dim) - self.attn_dropout_p = attn_pdrop - self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device) - fuse_splits = (d_model, d_model + self.head_dim) - self.Wqkv._fused = (0, fuse_splits) - if self.qk_ln: - layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm - self.q_ln = layernorm_class(d_model, device=device) - self.k_ln = layernorm_class(self.head_dim, device=device) - if self.attn_impl == "flash": - self.attn_fn = flash_attn_fn - elif self.attn_impl == "triton": - self.attn_fn = triton_flash_attn_fn - if verbose: - warnings.warn( - "While `attn_impl: triton` can be faster than `attn_impl: flash` " - + "it uses more memory. When training larger models this can trigger " - + "alloc retries which hurts performance. If encountered, we recommend " - + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`." - ) - elif self.attn_impl == "torch": - self.attn_fn = scaled_multihead_dot_product_attention - if torch.cuda.is_available() and verbose: - warnings.warn( - "Using `attn_impl: torch`. If your model does not use `alibi` or " - + "`prefix_lm` we recommend using `attn_impl: flash` otherwise " - + "we recommend using `attn_impl: triton`." 
- ) - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - self.out_proj = nn.Linear(self.d_model, self.d_model, device=device) - self.out_proj._is_residual = True - - def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False): - qkv = self.Wqkv(x) - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2) - key_padding_mask = attention_mask - if self.qk_ln: - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - (context, attn_weights, past_key_value) = self.attn_fn( - query, - key, - value, - self.n_heads, - past_key_value=past_key_value, - softmax_scale=self.softmax_scale, - attn_bias=attn_bias, - key_padding_mask=key_padding_mask, - is_causal=is_causal, - dropout_p=self.attn_dropout_p, - training=self.training, - needs_weights=needs_weights, - multiquery=True, - ) - return (self.out_proj(context), attn_weights, past_key_value) - - -def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id): - if attn_impl == "flash": - return None - elif attn_impl in ["torch", "triton"]: - if alibi: - if (prefix_lm or not causal) or use_sequence_id: - return (1, n_heads, seq_len, seq_len) - return (1, n_heads, 1, seq_len) - elif prefix_lm or use_sequence_id: - return (1, 1, seq_len, seq_len) - return None - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - - -def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8): - if attn_impl == "flash": - return None - elif attn_impl in ["torch", "triton"]: - if alibi: - (device, dtype) = (attn_bias.device, attn_bias.dtype) - attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype)) - return attn_bias - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - - -def gen_slopes(n_heads, alibi_bias_max=8, device=None): - _n_heads = 2 ** math.ceil(math.log2(n_heads)) - m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device) - m = m.mul(alibi_bias_max / _n_heads) - slopes = 1.0 / torch.pow(2, m) - if _n_heads != n_heads: - slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads] - return slopes.view(1, n_heads, 1, 1) - - -def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None): - alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len) - if full: - alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1) - alibi_bias = alibi_bias.abs().mul(-1) - slopes = gen_slopes(n_heads, alibi_bias_max, device=device) - alibi_bias = alibi_bias * slopes - return alibi_bias.to(dtype=dtype) - - -ATTN_CLASS_REGISTRY = {"multihead_attention": MultiheadAttention, "multiquery_attention": MultiQueryAttention} diff --git a/mllm/flamingo/mpt/blocks.py b/mllm/flamingo/mpt/blocks.py deleted file mode 100644 index dc16f5daa19f5f720b522eb2546edafea8f007a1..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/blocks.py +++ /dev/null @@ -1,82 +0,0 @@ -"""GPT Blocks used for the GPT Model.""" -from typing import Dict, Optional, Tuple -import torch -import torch.nn as nn -from .attention import ATTN_CLASS_REGISTRY -from .norm import NORM_CLASS_REGISTRY - - -class MPTMLP(nn.Module): - def __init__(self, d_model: 
int, expansion_ratio: int, device: Optional[str] = None): - super().__init__() - self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) - ## yh: hard code - # self.act = nn.GELU(approximate='none') - self.act = nn.GELU() - self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) - self.down_proj._is_residual = True - - def forward(self, x): - return self.down_proj(self.act(self.up_proj(x))) - - -class MPTBlock(nn.Module): - def __init__( - self, - d_model: int, - n_heads: int, - expansion_ratio: int, - attn_config: Dict = { - "attn_type": "multihead_attention", - "attn_pdrop": 0.0, - "attn_impl": "triton", - "qk_ln": False, - "clip_qkv": None, - "softmax_scale": None, - "prefix_lm": False, - "attn_uses_sequence_id": False, - "alibi": False, - "alibi_bias_max": 8, - }, - resid_pdrop: float = 0.0, - norm_type: str = "low_precision_layernorm", - verbose: int = 0, - device: Optional[str] = None, - **kwargs - ): - del kwargs - super().__init__() - norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] - attn_class = ATTN_CLASS_REGISTRY[attn_config["attn_type"]] - self.norm_1 = norm_class(d_model, device=device) - self.attn = attn_class( - attn_impl=attn_config["attn_impl"], - clip_qkv=attn_config["clip_qkv"], - qk_ln=attn_config["qk_ln"], - softmax_scale=attn_config["softmax_scale"], - attn_pdrop=attn_config["attn_pdrop"], - d_model=d_model, - n_heads=n_heads, - verbose=verbose, - device=device, - ) - self.norm_2 = norm_class(d_model, device=device) - self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) - self.resid_attn_dropout = nn.Dropout(resid_pdrop) - self.resid_ffn_dropout = nn.Dropout(resid_pdrop) - - def forward( - self, - x: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attn_bias: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.ByteTensor] = None, - is_causal: bool = True, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: - a = self.norm_1(x) - (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal) - x = x + self.resid_attn_dropout(b) - m = self.norm_2(x) - n = self.ffn(m) - x = x + self.resid_ffn_dropout(n) - return (x, attn_weights, past_key_value) diff --git a/mllm/flamingo/mpt/configuration_mpt.py b/mllm/flamingo/mpt/configuration_mpt.py deleted file mode 100644 index 8b35ee0170e849e65ac1019d6b89db95e40b622c..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/configuration_mpt.py +++ /dev/null @@ -1,161 +0,0 @@ -"""A HuggingFace-style model configuration.""" -from typing import Dict, Optional, Union -from transformers import PretrainedConfig - -attn_config_defaults: Dict = { - "attn_type": "multihead_attention", - "attn_pdrop": 0.0, - "attn_impl": "triton", - "qk_ln": False, - "clip_qkv": None, - "softmax_scale": None, - "prefix_lm": False, - "attn_uses_sequence_id": False, - "alibi": False, - "alibi_bias_max": 8, -} -init_config_defaults: Dict = { - "name": "kaiming_normal_", - "fan_mode": "fan_in", - "init_nonlinearity": "relu", - "init_div_is_residual": True, - "emb_init_std": None, - "emb_init_uniform_lim": None, - "init_std": None, - "init_gain": 0.0, -} - - -class MPTConfig(PretrainedConfig): - model_type = "mpt" - - def __init__( - self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - expansion_ratio: int = 4, - max_seq_len: int = 2048, - vocab_size: int = 50368, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, - 
learned_pos_emb: bool = True, - attn_config: Dict = attn_config_defaults, - init_device: str = "cpu", - logit_scale: Optional[Union[float, str]] = None, - no_bias: bool = False, - verbose: int = 0, - embedding_fraction: float = 1.0, - norm_type: str = "low_precision_layernorm", - use_cache: bool = False, - init_config: Dict = init_config_defaults, - **kwargs, - ): - """The MPT configuration class. - - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the MLP. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. - alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. - norm_type (str): choose type of norm to use - multiquery_attention (bool): Whether to use multiquery attention implementation. - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. 
- init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - """ - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.expansion_ratio = expansion_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.learned_pos_emb = learned_pos_emb - self.attn_config = attn_config - self.init_device = init_device - self.logit_scale = logit_scale - self.no_bias = no_bias - self.verbose = verbose - self.embedding_fraction = embedding_fraction - self.norm_type = norm_type - self.use_cache = use_cache - self.init_config = init_config - if "name" in kwargs: - del kwargs["name"] - if "loss_fn" in kwargs: - del kwargs["loss_fn"] - super().__init__(**kwargs) - self._validate_config() - - def _set_config_defaults(self, config, config_defaults): - for k, v in config_defaults.items(): - if k not in config: - config[k] = v - return config - - def _validate_config(self): - self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults) - self.init_config = self._set_config_defaults(self.init_config, init_config_defaults) - if self.d_model % self.n_heads != 0: - raise ValueError("d_model must be divisible by n_heads") - if any((prob < 0 or prob > 1 for prob in [self.attn_config["attn_pdrop"], self.resid_pdrop, self.emb_pdrop])): - raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1") - if self.attn_config["attn_impl"] not in ["torch", "flash", "triton"]: - raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}") - if self.attn_config["prefix_lm"] and self.attn_config["attn_impl"] not in ["torch", "triton"]: - raise NotImplementedError("prefix_lm only implemented with torch and triton attention.") - if self.attn_config["alibi"] and self.attn_config["attn_impl"] not in ["torch", "triton"]: - raise NotImplementedError("alibi only implemented with torch and triton attention.") - if self.attn_config["attn_uses_sequence_id"] and self.attn_config["attn_impl"] not in ["torch", "triton"]: - raise NotImplementedError("attn_uses_sequence_id only implemented with torch and triton attention.") - if self.embedding_fraction > 1 or self.embedding_fraction <= 0: - raise ValueError("model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!") - if isinstance(self.logit_scale, str) and self.logit_scale != "inv_sqrt_d_model": - raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 
'inv_sqrt_d_model'.") - if self.init_config.get("name", None) is None: - raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.") - if not self.learned_pos_emb and (not self.attn_config["alibi"]): - raise ValueError(f"Positional information must be provided to the model using either learned_pos_emb or alibi.") diff --git a/mllm/flamingo/mpt/custom_embedding.py b/mllm/flamingo/mpt/custom_embedding.py deleted file mode 100644 index 83979e7e7d8552b32c97d3473d8fd4bb12bd45f3..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/custom_embedding.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor - - -class SharedEmbedding(nn.Embedding): - def forward(self, input: Tensor, unembed: bool = False) -> Tensor: - if unembed: - return F.linear(input, self.weight) - return super().forward(input) diff --git a/mllm/flamingo/mpt/flash_attn_triton.py b/mllm/flamingo/mpt/flash_attn_triton.py deleted file mode 100644 index 07277af6883c90201bf005ce5fbe3723c3caa2b3..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/flash_attn_triton.py +++ /dev/null @@ -1,841 +0,0 @@ -""" -Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py -update imports to use 'triton_pre_mlir' - -*Experimental* implementation of FlashAttention in Triton. -Tested with triton==2.0.0.dev20221202. -Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions -other than 64: -https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207 -We'll update this implementation with the new Triton backend once this is fixed. - -We use the FlashAttention implementation from Phil Tillet a starting point. -https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py - -Changes: -- Implement both causal and non-causal attention. -- Implement both self-attention and cross-attention. -- Support arbitrary seqlens (not just multiples of 128), for both forward and backward. -- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward. -- Support attention bias. -- Speed up the forward pass a bit, and only store the LSE instead of m and l. -- Make the backward for d=128 much faster by reducing register spilling. -- Optionally parallelize the backward pass across seqlen_k, to deal with the case of -small batch size * nheads. - -Caution: -- This is an *experimental* implementation. The forward pass should be quite robust but -I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler). -- This implementation has only been tested on A100. -- If you plan to use headdim other than 64 and 128, you should test for race conditions -(due to the Triton compiler), as done in tests/test_flash_attn.py -"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions -for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident -that there are none left for other head dimensions. - -Differences between this Triton version and the CUDA version: -- Triton version doesn't support dropout. -- Triton forward is generally faster than CUDA forward, while Triton backward is -generally slower than CUDA backward. Overall Triton forward + backward is slightly slower -than CUDA forward + backward. 
-- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor). -- Triton version supports attention bias, while CUDA version doesn't. -""" -import math -import torch -import triton_pre_mlir as triton -import triton_pre_mlir.language as tl - - -@triton.heuristics( - { - "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, - "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, - "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], - } -) -@triton.jit -def _fwd_kernel( - Q, - K, - V, - Bias, - Out, - Lse, - TMP, - softmax_scale, - stride_qb, - stride_qh, - stride_qm, - stride_kb, - stride_kh, - stride_kn, - stride_vb, - stride_vh, - stride_vn, - stride_bb, - stride_bh, - stride_bm, - stride_ob, - stride_oh, - stride_om, - nheads, - seqlen_q, - seqlen_k, - seqlen_q_rounded, - headdim, - CACHE_KEY_SEQLEN_Q, - CACHE_KEY_SEQLEN_K, - BIAS_TYPE: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, - EVEN_M: tl.constexpr, - EVEN_N: tl.constexpr, - EVEN_HEADDIM: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, -): - start_m = tl.program_id(0) - off_hb = tl.program_id(1) - off_b = off_hb // nheads - off_h = off_hb % nheads - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_HEADDIM) - q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :]) - k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :]) - v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :]) - if BIAS_TYPE == "vector": - b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n - elif BIAS_TYPE == "matrix": - b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :]) - t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m - lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32) - if EVEN_M & EVEN_N: - if EVEN_HEADDIM: - q = tl.load(q_ptrs) - else: - q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - elif EVEN_HEADDIM: - q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0) - else: - q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0) - end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k) - for start_n in range(0, end_n, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - if EVEN_N & EVEN_M: - if EVEN_HEADDIM: - k = tl.load(k_ptrs + start_n * stride_kn) - else: - k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0) - elif EVEN_HEADDIM: - k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0) - else: - k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k, trans_b=True) - if not EVEN_N: - qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf")) - if IS_CAUSAL: - qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf")) - if BIAS_TYPE != "none": - if BIAS_TYPE == "vector": - if EVEN_N: - bias = tl.load(b_ptrs + start_n).to(tl.float32) - else: - bias = tl.load(b_ptrs + start_n, mask=start_n + 
offs_n < seqlen_k, other=0.0).to(tl.float32) - bias = bias[None, :] - elif BIAS_TYPE == "matrix": - if EVEN_M & EVEN_N: - bias = tl.load(b_ptrs + start_n).to(tl.float32) - else: - bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32) - qk = qk * softmax_scale + bias - m_ij = tl.maximum(tl.max(qk, 1), lse_i) - p = tl.exp(qk - m_ij[:, None]) - else: - m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i) - p = tl.exp(qk * softmax_scale - m_ij[:, None]) - l_ij = tl.sum(p, 1) - acc_o_scale = tl.exp(m_i - m_ij) - tl.store(t_ptrs, acc_o_scale) - acc_o_scale = tl.load(t_ptrs) - acc_o = acc_o * acc_o_scale[:, None] - if EVEN_N & EVEN_M: - if EVEN_HEADDIM: - v = tl.load(v_ptrs + start_n * stride_vn) - else: - v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0) - elif EVEN_HEADDIM: - v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0) - else: - v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0) - p = p.to(v.dtype) - acc_o += tl.dot(p, v) - m_i = m_ij - l_i_new = tl.exp(lse_i - m_ij) + l_ij - lse_i = m_ij + tl.log(l_i_new) - o_scale = tl.exp(m_i - lse_i) - tl.store(t_ptrs, o_scale) - o_scale = tl.load(t_ptrs) - acc_o = acc_o * o_scale[:, None] - start_m = tl.program_id(0) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m - tl.store(lse_ptrs, lse_i) - offs_d = tl.arange(0, BLOCK_HEADDIM) - out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :]) - if EVEN_M: - if EVEN_HEADDIM: - tl.store(out_ptrs, acc_o) - else: - tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim) - elif EVEN_HEADDIM: - tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q) - else: - tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)) - - -@triton.jit -def _bwd_preprocess_do_o_dot( - Out, - DO, - Delta, - stride_ob, - stride_oh, - stride_om, - stride_dob, - stride_doh, - stride_dom, - nheads, - seqlen_q, - seqlen_q_rounded, - headdim, - BLOCK_M: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, -): - start_m = tl.program_id(0) - off_hb = tl.program_id(1) - off_b = off_hb // nheads - off_h = off_hb % nheads - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_d = tl.arange(0, BLOCK_HEADDIM) - o = tl.load( - Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], - mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), - other=0.0, - ).to(tl.float32) - do = tl.load( - DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], - mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), - other=0.0, - ).to(tl.float32) - delta = tl.sum(o * do, axis=1) - tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta) - - -@triton.jit -def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr): - if EVEN_N & EVEN_M: - if EVEN_HEADDIM: - tl.store(dv_ptrs, dv) - tl.store(dk_ptrs, dk) - else: - tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim) - tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim) - elif EVEN_HEADDIM: - tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k) - tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k) - else: - tl.store(dv_ptrs, dv, 
mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) - tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) - - -@triton.jit -def _bwd_kernel_one_col_block( - start_n, - Q, - K, - V, - Bias, - DO, - DQ, - DK, - DV, - LSE, - D, - softmax_scale, - stride_qm, - stride_kn, - stride_vn, - stride_bm, - stride_dom, - stride_dqm, - stride_dkn, - stride_dvn, - seqlen_q, - seqlen_k, - headdim, - ATOMIC_ADD: tl.constexpr, - BIAS_TYPE: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, - EVEN_M: tl.constexpr, - EVEN_N: tl.constexpr, - EVEN_HEADDIM: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, -): - begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M - offs_qm = begin_m + tl.arange(0, BLOCK_M) - offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) - offs_m = tl.arange(0, BLOCK_M) - offs_d = tl.arange(0, BLOCK_HEADDIM) - q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :]) - k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :]) - v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :]) - do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :]) - dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :]) - if BIAS_TYPE == "vector": - b_ptrs = Bias + offs_n - elif BIAS_TYPE == "matrix": - b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :]) - dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) - dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) - if begin_m >= seqlen_q: - dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) - dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) - _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM) - return - if EVEN_N & EVEN_M: - if EVEN_HEADDIM: - k = tl.load(k_ptrs) - v = tl.load(v_ptrs) - else: - k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - elif EVEN_HEADDIM: - k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) - v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) - else: - k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0) - v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0) - num_block_m = tl.cdiv(seqlen_q, BLOCK_M) - for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M): - start_m = tl.multiple_of(start_m, BLOCK_M) - offs_m_curr = start_m + offs_m - if EVEN_M & EVEN_HEADDIM: - q = tl.load(q_ptrs) - elif EVEN_HEADDIM: - q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) - else: - q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0) - qk = tl.dot(q, k, trans_b=True) - if not EVEN_N: - qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf")) - if IS_CAUSAL: - qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float("-inf")) - if BIAS_TYPE != "none": - tl.debug_barrier() - if BIAS_TYPE == "vector": - if EVEN_N: - bias = tl.load(b_ptrs).to(tl.float32) - else: - bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32) - bias = bias[None, :] - elif BIAS_TYPE == "matrix": - if EVEN_M & EVEN_N: - bias = tl.load(b_ptrs).to(tl.float32) - else: - bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32) - qk = qk * softmax_scale + bias - if not EVEN_M & 
EVEN_HEADDIM: - tl.debug_barrier() - lse_i = tl.load(LSE + offs_m_curr) - if BIAS_TYPE == "none": - p = tl.exp(qk * softmax_scale - lse_i[:, None]) - else: - p = tl.exp(qk - lse_i[:, None]) - if EVEN_M & EVEN_HEADDIM: - do = tl.load(do_ptrs) - else: - do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0) - dv += tl.dot(p.to(do.dtype), do, trans_a=True) - if not EVEN_M & EVEN_HEADDIM: - tl.debug_barrier() - dp = tl.dot(do, v, trans_b=True) - if not EVEN_HEADDIM: - tl.debug_barrier() - Di = tl.load(D + offs_m_curr) - ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype) - dk += tl.dot(ds, q, trans_a=True) - if not EVEN_M & EVEN_HEADDIM: - tl.debug_barrier() - if not ATOMIC_ADD: - if EVEN_M & EVEN_HEADDIM: - dq = tl.load(dq_ptrs, eviction_policy="evict_last") - dq += tl.dot(ds, k) - tl.store(dq_ptrs, dq, eviction_policy="evict_last") - elif EVEN_HEADDIM: - dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy="evict_last") - dq += tl.dot(ds, k) - tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy="evict_last") - else: - dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy="evict_last") - dq += tl.dot(ds, k) - tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy="evict_last") - else: - dq = tl.dot(ds, k) - if EVEN_M & EVEN_HEADDIM: - tl.atomic_add(dq_ptrs, dq) - elif EVEN_HEADDIM: - tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q) - else: - tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)) - dq_ptrs += BLOCK_M * stride_dqm - q_ptrs += BLOCK_M * stride_qm - do_ptrs += BLOCK_M * stride_dom - if BIAS_TYPE == "matrix": - b_ptrs += BLOCK_M * stride_bm - dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) - dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) - _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM) - - -def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - -@triton.autotune( - configs=[ - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero("DQ")), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero("DQ")), - ], - key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"], -) -@triton.heuristics( - { - "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, - "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, - "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], - } -) -@triton.jit -def _bwd_kernel( - Q, - K, - V, - Bias, - DO, - DQ, - DK, - DV, - LSE, - D, - softmax_scale, - stride_qb, - stride_qh, - stride_qm, - stride_kb, - stride_kh, - stride_kn, - stride_vb, - stride_vh, - stride_vn, - stride_bb, - stride_bh, - stride_bm, - stride_dob, - stride_doh, - stride_dom, - stride_dqb, - stride_dqh, - stride_dqm, - stride_dkb, - stride_dkh, - stride_dkn, - stride_dvb, - stride_dvh, - stride_dvn, - nheads, - seqlen_q, - seqlen_k, - seqlen_q_rounded, - headdim, - CACHE_KEY_SEQLEN_Q, - CACHE_KEY_SEQLEN_K, - BIAS_TYPE: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, - SEQUENCE_PARALLEL: tl.constexpr, - EVEN_M: tl.constexpr, - EVEN_N: 
tl.constexpr, - EVEN_HEADDIM: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, -): - off_hb = tl.program_id(1) - off_b = off_hb // nheads - off_h = off_hb % nheads - Q += off_b * stride_qb + off_h * stride_qh - K += off_b * stride_kb + off_h * stride_kh - V += off_b * stride_vb + off_h * stride_vh - DO += off_b * stride_dob + off_h * stride_doh - DQ += off_b * stride_dqb + off_h * stride_dqh - DK += off_b * stride_dkb + off_h * stride_dkh - DV += off_b * stride_dvb + off_h * stride_dvh - if BIAS_TYPE != "none": - Bias += off_b * stride_bb + off_h * stride_bh - D += off_hb * seqlen_q_rounded - LSE += off_hb * seqlen_q_rounded - if not SEQUENCE_PARALLEL: - num_block_n = tl.cdiv(seqlen_k, BLOCK_N) - for start_n in range(0, num_block_n): - _bwd_kernel_one_col_block( - start_n, - Q, - K, - V, - Bias, - DO, - DQ, - DK, - DV, - LSE, - D, - softmax_scale, - stride_qm, - stride_kn, - stride_vn, - stride_bm, - stride_dom, - stride_dqm, - stride_dkn, - stride_dvn, - seqlen_q, - seqlen_k, - headdim, - ATOMIC_ADD=False, - BIAS_TYPE=BIAS_TYPE, - IS_CAUSAL=IS_CAUSAL, - BLOCK_HEADDIM=BLOCK_HEADDIM, - EVEN_M=EVEN_M, - EVEN_N=EVEN_N, - EVEN_HEADDIM=EVEN_HEADDIM, - BLOCK_M=BLOCK_M, - BLOCK_N=BLOCK_N, - ) - else: - start_n = tl.program_id(0) - _bwd_kernel_one_col_block( - start_n, - Q, - K, - V, - Bias, - DO, - DQ, - DK, - DV, - LSE, - D, - softmax_scale, - stride_qm, - stride_kn, - stride_vn, - stride_bm, - stride_dom, - stride_dqm, - stride_dkn, - stride_dvn, - seqlen_q, - seqlen_k, - headdim, - ATOMIC_ADD=True, - BIAS_TYPE=BIAS_TYPE, - IS_CAUSAL=IS_CAUSAL, - BLOCK_HEADDIM=BLOCK_HEADDIM, - EVEN_M=EVEN_M, - EVEN_N=EVEN_N, - EVEN_HEADDIM=EVEN_HEADDIM, - BLOCK_M=BLOCK_M, - BLOCK_N=BLOCK_N, - ) - - -def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None): - (batch, seqlen_q, nheads, d) = q.shape - (_, seqlen_k, _, _) = k.shape - assert k.shape == (batch, seqlen_k, nheads, d) - assert v.shape == (batch, seqlen_k, nheads, d) - assert d <= 128, "FlashAttention only support head dimensions up to 128" - assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type" - assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16" - assert q.is_cuda and k.is_cuda and v.is_cuda - softmax_scale = softmax_scale or 1.0 / math.sqrt(d) - has_bias = bias is not None - bias_type = "none" - if has_bias: - assert bias.dtype in [q.dtype, torch.float] - assert bias.is_cuda - assert bias.dim() == 4 - if bias.stride(-1) != 1: - bias = bias.contiguous() - if bias.shape[2:] == (1, seqlen_k): - bias_type = "vector" - elif bias.shape[2:] == (seqlen_q, seqlen_k): - bias_type = "matrix" - else: - raise RuntimeError("Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)") - bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) - bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) - seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 - lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) - tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) - o = torch.empty_like(q) - BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) - BLOCK = 128 - num_warps = 4 if d <= 64 else 8 - grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) - _fwd_kernel[grid]( - q, - k, - v, - bias, - o, - lse, - tmp, - softmax_scale, - q.stride(0), - q.stride(2), - q.stride(1), - k.stride(0), - k.stride(2), - k.stride(1), - v.stride(0), - v.stride(2), - 
v.stride(1),
-        *bias_strides,
-        o.stride(0),
-        o.stride(2),
-        o.stride(1),
-        nheads,
-        seqlen_q,
-        seqlen_k,
-        seqlen_q_rounded,
-        d,
-        seqlen_q // 32,
-        seqlen_k // 32,
-        bias_type,
-        causal,
-        BLOCK_HEADDIM,
-        BLOCK_M=BLOCK,
-        BLOCK_N=BLOCK,
-        num_warps=num_warps,
-        num_stages=1,
-    )
-    return (o, lse, softmax_scale)
-
-
-def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
-    if do.stride(-1) != 1:
-        do = do.contiguous()
-    (batch, seqlen_q, nheads, d) = q.shape
-    (_, seqlen_k, _, _) = k.shape
-    assert d <= 128
-    seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
-    assert lse.shape == (batch, nheads, seqlen_q_rounded)
-    assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
-    assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
-    softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
-    dq_accum = torch.empty_like(q, dtype=torch.float32)
-    delta = torch.empty_like(lse)
-    BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
-    grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
-    _bwd_preprocess_do_o_dot[grid](
-        o,
-        do,
-        delta,
-        o.stride(0),
-        o.stride(2),
-        o.stride(1),
-        do.stride(0),
-        do.stride(2),
-        do.stride(1),
-        nheads,
-        seqlen_q,
-        seqlen_q_rounded,
-        d,
-        BLOCK_M=128,
-        BLOCK_HEADDIM=BLOCK_HEADDIM,
-    )
-    has_bias = bias is not None
-    bias_type = "none"
-    if has_bias:
-        assert bias.dtype in [q.dtype, torch.float]
-        assert bias.is_cuda
-        assert bias.dim() == 4
-        assert bias.stride(-1) == 1
-        if bias.shape[2:] == (1, seqlen_k):
-            bias_type = "vector"
-        elif bias.shape[2:] == (seqlen_q, seqlen_k):
-            bias_type = "matrix"
-        else:
-            raise RuntimeError("Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)")
-        bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
-    bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
-    grid = lambda META: (triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1, batch * nheads)
-    _bwd_kernel[grid](
-        q,
-        k,
-        v,
-        bias,
-        do,
-        dq_accum,
-        dk,
-        dv,
-        lse,
-        delta,
-        softmax_scale,
-        q.stride(0),
-        q.stride(2),
-        q.stride(1),
-        k.stride(0),
-        k.stride(2),
-        k.stride(1),
-        v.stride(0),
-        v.stride(2),
-        v.stride(1),
-        *bias_strides,
-        do.stride(0),
-        do.stride(2),
-        do.stride(1),
-        dq_accum.stride(0),
-        dq_accum.stride(2),
-        dq_accum.stride(1),
-        dk.stride(0),
-        dk.stride(2),
-        dk.stride(1),
-        dv.stride(0),
-        dv.stride(2),
-        dv.stride(1),
-        nheads,
-        seqlen_q,
-        seqlen_k,
-        seqlen_q_rounded,
-        d,
-        seqlen_q // 32,
-        seqlen_k // 32,
-        bias_type,
-        causal,
-        BLOCK_HEADDIM,
-    )
-    dq.copy_(dq_accum)
-
-
-class FlashAttnQKVPackedFunc(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
-        """
-        qkv: (batch, seqlen, 3, nheads, headdim)
-        bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
-            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
-            ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen).
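-
-        Example (an illustrative sketch; the shapes are arbitrary but respect the
-        constraints asserted in ``_flash_attn_forward``: fp16/bf16 tensors on CUDA
-        with headdim <= 128):
-
-        ```python
-        # Packed causal self-attention; note that autograd.Function.apply takes
-        # positional arguments only: (qkv, bias, causal, softmax_scale).
-        qkv = torch.randn(2, 512, 3, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
-        out = flash_attn_qkvpacked_func(qkv, None, True)  # out: (2, 512, 8, 64)
-        ```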
-        """
-        if qkv.stride(-1) != 1:
-            qkv = qkv.contiguous()
-        (o, lse, ctx.softmax_scale) = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
-        ctx.save_for_backward(qkv, o, lse, bias)
-        ctx.causal = causal
-        return o
-
-    @staticmethod
-    def backward(ctx, do):
-        (qkv, o, lse, bias) = ctx.saved_tensors
-        assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet"
-        with torch.inference_mode():
-            dqkv = torch.empty_like(qkv)
-            _flash_attn_backward(
-                do,
-                qkv[:, :, 0],
-                qkv[:, :, 1],
-                qkv[:, :, 2],
-                o,
-                lse,
-                dqkv[:, :, 0],
-                dqkv[:, :, 1],
-                dqkv[:, :, 2],
-                bias=bias,
-                causal=ctx.causal,
-                softmax_scale=ctx.softmax_scale,
-            )
-        return (dqkv, None, None, None)
-
-
-flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
-
-
-class FlashAttnKVPackedFunc(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
-        """
-        q: (batch, seqlen_q, nheads, headdim)
-        kv: (batch, seqlen_k, 2, nheads, headdim)
-        bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
-            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
-            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k).
-        """
-        (q, kv) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
-        (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
-        ctx.save_for_backward(q, kv, o, lse, bias)
-        ctx.causal = causal
-        return o
-
-    @staticmethod
-    def backward(ctx, do):
-        (q, kv, o, lse, bias) = ctx.saved_tensors
-        if len(ctx.needs_input_grad) >= 3:
-            assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet"
-        with torch.inference_mode():
-            dq = torch.empty_like(q)
-            dkv = torch.empty_like(kv)
-            _flash_attn_backward(
-                do, q, kv[:, :, 0], kv[:, :, 1], o, lse, dq, dkv[:, :, 0], dkv[:, :, 1], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale
-            )
-        return (dq, dkv, None, None, None)
-
-
-flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
-
-
-class FlashAttnFunc(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
-        """
-        q: (batch_size, seqlen_q, nheads, headdim)
-        k, v: (batch_size, seqlen_k, nheads, headdim)
-        bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
-            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
-            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k).
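-
-        Example (an illustrative sketch of a cross-attention call, where the query
-        and key/value sequence lengths differ; shapes are arbitrary):
-
-        ```python
-        q = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.float16)
-        k = torch.randn(2, 512, 8, 64, device="cuda", dtype=torch.float16)
-        v = torch.randn(2, 512, 8, 64, device="cuda", dtype=torch.float16)
-        # Positional arguments: (q, k, v, bias, causal, softmax_scale).
-        out = flash_attn_func(q, k, v, None, False)  # out: (2, 128, 8, 64)
-        ```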
-        """
-        (q, k, v) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
-        (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
-        ctx.save_for_backward(q, k, v, o, lse, bias)
-        ctx.causal = causal
-        return o
-
-    @staticmethod
-    def backward(ctx, do):
-        (q, k, v, o, lse, bias) = ctx.saved_tensors
-        assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet"
-        with torch.inference_mode():
-            dq = torch.empty_like(q)
-            dk = torch.empty_like(k)
-            dv = torch.empty_like(v)
-            _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
-        return (dq, dk, dv, None, None, None)
-
-
-flash_attn_func = FlashAttnFunc.apply
diff --git a/mllm/flamingo/mpt/hf_prefixlm_converter.py b/mllm/flamingo/mpt/hf_prefixlm_converter.py
deleted file mode 100644
index ea544cccbddace12ec9d65a199e9f73099247ba4..0000000000000000000000000000000000000000
--- a/mllm/flamingo/mpt/hf_prefixlm_converter.py
+++ /dev/null
@@ -1,575 +0,0 @@
-"""Converts a Huggingface Causal LM to a Prefix LM.
-
-Conversion does lightweight surgery on a HuggingFace
-Causal LM to convert it to a Prefix LM.
-
-Prefix LMs accept a `bidirectional_mask` input in `forward`
-and treat the input prompt as the prefix in `generate`.
-"""
-import math
-import warnings
-from types import MethodType
-from typing import Any, Dict, List, Optional, Tuple, Union
-import torch
-from transformers.models.bloom.modeling_bloom import (
-    BaseModelOutputWithPastAndCrossAttentions,
-    BloomForCausalLM,
-    BloomModel,
-    CausalLMOutputWithCrossAttentions,
-    CrossEntropyLoss,
-)
-from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom
-from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom
-from transformers.models.bloom.modeling_bloom import logging
-from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
-from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
-from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
-from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
-from transformers.models.opt.modeling_opt import OPTForCausalLM
-from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt
-from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
-
-logger = logging.get_logger(__name__)
-_SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
-CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM]
-
-
-def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
-    """Converts a GPT-style Causal LM to a Prefix LM.
-
-    Supported HuggingFace model classes:
-        - `GPT2LMHeadModel`
-        - `GPTNeoForCausalLM`
-        - `GPTNeoXForCausalLM`
-        - `GPTJForCausalLM`
-
-    See `convert_hf_causal_lm_to_prefix_lm` for more details.
-    """
-    if hasattr(model, "_prefix_lm_converted"):
-        return model
-    assert isinstance(model, _SUPPORTED_GPT_MODELS)
-    assert not model.config.add_cross_attention, "Only supports GPT-style decoder-only models"
-
-    def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
-        """Helper that gets a list of the model's attention modules.
-
-        Each module has a `bias` buffer used for causal masking.
The Prefix LM - conversion adds logic to dynamically manipulate these biases to support - Prefix LM attention masking. - """ - attn_modules = [] - if isinstance(model, GPTNeoXForCausalLM): - blocks = model.gpt_neox.layers - else: - blocks = model.transformer.h - for block in blocks: - if isinstance(model, GPTNeoForCausalLM): - if block.attn.attention_type != "global": - continue - attn_module = block.attn.attention - elif isinstance(model, GPTNeoXForCausalLM): - attn_module = block.attention - else: - attn_module = block.attn - attn_modules.append(attn_module) - return attn_modules - - setattr(model, "_original_forward", getattr(model, "forward")) - setattr(model, "_original_generate", getattr(model, "generate")) - - def forward( - self: CAUSAL_GPT_TYPES, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - bidirectional_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - """Wraps original forward to enable PrefixLM attention.""" - - def call_og_forward(): - if isinstance(self, GPTNeoXForCausalLM): - return self._original_forward( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - else: - return self._original_forward( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if bidirectional_mask is None: - return call_og_forward() - assert isinstance(bidirectional_mask, torch.Tensor) - attn_modules = _get_attn_modules(model) - (b, s) = bidirectional_mask.shape - max_length = attn_modules[0].bias.shape[-1] - if s > max_length: - raise ValueError(f"bidirectional_mask sequence length (={s}) exceeds the " + f"max length allowed by the model ({max_length}).") - assert s <= max_length - if s < max_length: - pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device) - bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1) - bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1) - for attn_module in attn_modules: - attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional) - output = call_og_forward() - for attn_module in attn_modules: - attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None] - return output - - def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]): - """Wraps original generate to enable PrefixLM attention.""" - attn_modules = _get_attn_modules(model) - for attn_module in attn_modules: - attn_module.bias.data[:] = 1 - output = self._original_generate(*args, **kwargs) - for attn_module 
in attn_modules: - attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None] - return output - - setattr(model, "forward", MethodType(forward, model)) - setattr(model, "generate", MethodType(generate, model)) - setattr(model, "_prefix_lm_converted", True) - return model - - -def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM: - """Converts a BLOOM Causal LM to a Prefix LM. - - Supported HuggingFace model classes: - - `BloomForCausalLM` - - See `convert_hf_causal_lm_to_prefix_lm` for more details. - """ - if hasattr(model, "_prefix_lm_converted"): - return model - assert isinstance(model, BloomForCausalLM) - assert model.config.add_cross_attention == False, "Only supports BLOOM decoder-only models" - - def _prepare_attn_mask( - self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int - ) -> torch.BoolTensor: - combined_attention_mask = None - device = attention_mask.device - (_, src_length) = input_shape - if src_length > 1: - combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length) - if bidirectional_mask is not None: - assert attention_mask.shape == bidirectional_mask.shape - expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length) - combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask) - expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length) - combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask - return combined_attention_mask - - def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - num_heads = self.config.n_head - closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) - base = torch.tensor(2 ** (-(2 ** (-(math.log2(closest_power_of_2) - 3)))), device=device, dtype=torch.float32) - powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32) - slopes = torch.pow(base, powers) - if closest_power_of_2 != num_heads: - extra_base = torch.tensor(2 ** (-(2 ** (-(math.log2(2 * closest_power_of_2) - 3)))), device=device, dtype=torch.float32) - num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) - extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32) - slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) - qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1) - ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1) - diffs = qa - ka + key_length - query_length - diffs = -diffs.abs() - alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length) - alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length) - return alibi.to(dtype) - - KeyValueT = Tuple[torch.Tensor, torch.Tensor] - - def forward( - self: BloomModel, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[KeyValueT, ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - bidirectional_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - 
output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - if deprecated_arguments.pop("position_ids", False) is not False: - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. " + "You can safely ignore passing `position_ids`.", FutureWarning - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - (batch_size, seq_length) = input_ids.shape - elif inputs_embeds is not None: - (batch_size, seq_length, _) = inputs_embeds.shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - hidden_states = self.word_embeddings_layernorm(inputs_embeds) - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: - tmp = past_key_values[0][0] - past_key_values_length = tmp.shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) - else: - attention_mask = attention_mask.to(hidden_states.device) - alibi = self._build_alibi_tensor( - batch_size=batch_size, query_length=seq_length, key_length=seq_length_with_past, dtype=hidden_states.dtype, device=hidden_states.device - ) - causal_mask = self._prepare_attn_mask( - attention_mask, bidirectional_mask, input_shape=(batch_size, seq_length), past_key_values_length=past_key_values_length - ) - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - hst = (hidden_states,) - all_hidden_states = all_hidden_states + hst - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`...") - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, use_cache=use_cache, output_attentions=output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i]) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=causal_mask, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - alibi=alibi, - ) - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - if output_attentions: - oa = (outputs[2 if use_cache else 1],) - all_self_attentions = all_self_attentions + oa - hidden_states = self.ln_f(hidden_states) - if output_hidden_states: - hst = (hidden_states,) - all_hidden_states = all_hidden_states + hst - if not return_dict: - return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions - ) - - setattr(model.transformer, "_prepare_attn_mask", MethodType(_prepare_attn_mask, model.transformer)) - setattr(model.transformer, "_build_alibi_tensor", MethodType(_build_alibi_tensor, model.transformer)) - setattr(model.transformer, "forward", MethodType(forward, model.transformer)) - KeyValueT = Tuple[torch.Tensor, torch.Tensor] - - def forward( - self: BloomForCausalLM, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[KeyValueT, ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - bidirectional_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - """Replacement forward method for BloomCausalLM.""" - if deprecated_arguments.pop("position_ids", False) is not False: - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed " + "in v5.0.0. 
You can safely ignore passing `position_ids`.", FutureWarning - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - bidirectional_mask=bidirectional_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - lm_logits = self.lm_head(hidden_states) - loss = None - if labels is not None: - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - (batch_size, seq_length, vocab_size) = shift_logits.shape - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)) - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return (loss,) + output if loss is not None else output - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - def prepare_inputs_for_generation( - self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs - ) -> dict: - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) - bidirectional_mask = None - if past[0][0].shape[0] == input_ids.shape[0]: - past = self._convert_to_bloom_cache(past) - else: - bidirectional_mask = torch.ones_like(input_ids) - return {"input_ids": input_ids, "past_key_values": past, "use_cache": True, "attention_mask": attention_mask, "bidirectional_mask": bidirectional_mask} - - setattr(model, "forward", MethodType(forward, model)) - setattr(model, "prepare_inputs_for_generation", MethodType(prepare_inputs_for_generation, model)) - setattr(model, "_prefix_lm_converted", True) - return model - - -def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM: - """Converts an OPT Causal LM to a Prefix LM. - - Supported HuggingFace model classes: - - `OPTForCausalLM` - - See `convert_hf_causal_lm_to_prefix_lm` for more details. 
- """ - if hasattr(model, "_prefix_lm_converted"): - return model - assert isinstance(model, OPTForCausalLM) - assert model.config.add_cross_attention == False, "Only supports OPT decoder-only models" - setattr(model, "_original_forward", getattr(model, "forward")) - setattr(model, "_original_generate", getattr(model, "generate")) - model.model.decoder.bidirectional_mask = None - - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - combined_attention_mask = None - if input_shape[-1] > 1: - if self.bidirectional_mask == "g": - (bsz, src_length) = input_shape - combined_attention_mask = torch.zeros( - (bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device - ) - else: - combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to( - inputs_embeds.device - ) - if self.bidirectional_mask is not None: - assert attention_mask.shape == self.bidirectional_mask.shape - expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask) - if attention_mask is not None: - expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device) - combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - return combined_attention_mask - - setattr(model.model.decoder, "_prepare_decoder_attention_mask", MethodType(_prepare_decoder_attention_mask, model.model.decoder)) - - def forward( - self: OPTForCausalLM, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - bidirectional_mask: Optional[torch.ByteTensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - def call_og_forward(): - return self._original_forward( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if bidirectional_mask is None: - return call_og_forward() - self.model.decoder.bidirectional_mask = bidirectional_mask - try: - outputs = call_og_forward() - except: - self.model.decoder.bidirectional_mask = None - raise - self.model.decoder.bidirectional_mask = None - return outputs - - def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]): - """Wraps original generate to enable PrefixLM-style attention.""" - self.model.decoder.bidirectional_mask = "g" - try: - output = self._original_generate(*args, **kwargs) - except: - self.model.decoder.bidirectional_mask = None - raise - self.model.decoder.bidirectional_mask = None - return output - - setattr(model, "forward", MethodType(forward, model)) - setattr(model, "generate", MethodType(generate, model)) - setattr(model, "_prefix_lm_converted", True) - return model - - 
-_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
-CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]
-
-
-def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
-    """Converts a HuggingFace Causal LM to a Prefix LM.
-
-    Supported HuggingFace model classes:
-        - `GPT2LMHeadModel`
-        - `GPTNeoForCausalLM`
-        - `GPTNeoXForCausalLM`
-        - `GPTJForCausalLM`
-        - `BloomForCausalLM`
-        - `OPTForCausalLM`
-
-    Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
-    `generate` method and/or select underlying methods depending on the model class.
-
-    These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".
-
-    Notes on training:
-    To actually train the converted model as a Prefix LM, training batches will need to indicate
-    the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.
-
-    **This is not a standard input and requires custom code either within or after your dataloader.**
-
-    In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
-    such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
-    That is, the prefix portion of the sequence should not generate any loss. Loss should only be
-    generated by the target portion of the sequence.
-
-    Notes on `GPTNeoForCausalLM`:
-    To simplify the implementation, "global" and "local" attention layers are handled differently.
-    For "global" layers, we handle conversion as described above. For "local" layers, which use a
-    causal attention mask within a restricted local window, we do not alter the masking.
-
-    Notes on `forward` method conversion:
-    After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
-    which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
-    belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
-    0 indicates token positions belonging to the target.
-
-    The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
-    causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
-    the causal masks before returning the result.
-
-    Notes on `generate` method conversion:
-    After conversion, the `generate` method will have the same signature but will internally
-    convert all causal masks to be purely bidirectional, call the original `generate` method, and
-    (where appropriate) reset the causal masks before returning the result.
-
-    This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
-    "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
-    each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
-    another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
-    previously-generated tokens (also as expected in a Prefix LM).
-
-    To preserve the API, the original methods are renamed to `_original_forward` and
-    `_original_generate`, and replaced with new `forward` and `generate` methods that wrap
-    them, respectively, although implementation details vary by model class.
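-
-    Example (an illustrative sketch; the checkpoint name and the choice of a
-    three-token prefix are arbitrary assumptions):
-
-    ```python
-    from transformers import AutoTokenizer, GPT2LMHeadModel
-
-    model = convert_hf_causal_lm_to_prefix_lm(GPT2LMHeadModel.from_pretrained("gpt2"))
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    batch = tokenizer(["The prefix. The target."], return_tensors="pt")
-    # Mark prefix tokens with 1 and target tokens with 0 (here: the first 3 tokens).
-    batch["bidirectional_mask"] = batch["attention_mask"].clone()
-    batch["bidirectional_mask"][:, 3:] = 0
-    outputs = model(**batch)
-    ```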
- """ - if isinstance(model, _SUPPORTED_GPT_MODELS): - return _convert_gpt_causal_lm_to_prefix_lm(model) - elif isinstance(model, BloomForCausalLM): - return _convert_bloom_causal_lm_to_prefix_lm(model) - elif isinstance(model, OPTForCausalLM): - return _convert_opt_causal_lm_to_prefix_lm(model) - else: - raise TypeError(f"Cannot convert model to Prefix LM. " + f"Model does not belong to set of supported HF models:" + f"\n{_SUPPORTED_HF_MODELS}") - - -def add_bidirectional_mask_if_missing(batch: Dict[str, Any]): - """Attempts to add bidirectional_mask to batch if missing. - - Raises: - KeyError if bidirectional_mask is missing and can't be inferred - """ - if "bidirectional_mask" not in batch: - if batch.get("mode", None) == "icl_task": - batch["bidirectional_mask"] = batch["attention_mask"].clone() - for i, continuation_indices in enumerate(batch["continuation_indices"]): - batch["bidirectional_mask"][i, continuation_indices] = 0 - elif "labels" in batch and "attention_mask" in batch: - batch["bidirectional_mask"] = torch.logical_and(torch.eq(batch["attention_mask"], 1), torch.eq(batch["labels"], -100)).type_as( - batch["attention_mask"] - ) - else: - raise KeyError("No bidirectional_mask in batch and not sure how to construct one.") diff --git a/mllm/flamingo/mpt/meta_init_context.py b/mllm/flamingo/mpt/meta_init_context.py deleted file mode 100644 index df7ca692d71fe349f9571e500ca2cc386236aa98..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/meta_init_context.py +++ /dev/null @@ -1,98 +0,0 @@ -from contextlib import contextmanager -import torch -import torch.nn as nn - - -@contextmanager -def init_empty_weights(include_buffers: bool = False): - """Meta initialization context manager. - - A context manager under which models are initialized with all parameters - on the meta device, therefore creating an empty model. Useful when just - initializing the model would blow the available RAM. - - Args: - include_buffers (`bool`, *optional*, defaults to `False`): Whether or - not to also put all buffers on the meta device while initializing. - - Example: - ```python - import torch.nn as nn - - # Initialize a model with 100 billions parameters in no time and without using any RAM. - with init_empty_weights(): - tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) - ``` - - - - Any model created under this context manager has no weights. As such you can't do something like - `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. - - - """ - with init_on_device(torch.device("meta"), include_buffers=include_buffers) as f: - yield f - - -@contextmanager -def init_on_device(device: torch.device, include_buffers: bool = False): - """Device initialization context manager. - - A context manager under which models are initialized with all parameters - on the specified device. - - Args: - device (`torch.device`): Device to initialize all parameters on. - include_buffers (`bool`, *optional*, defaults to `False`): Whether or - not to also put all buffers on the meta device while initializing. 
-
-    Example:
-    ```python
-    import torch.nn as nn
-
-    with init_on_device(device=torch.device("cuda")):
-        tst = nn.Linear(100, 100)  # on `cuda` device
-    ```
-    """
-    old_register_parameter = nn.Module.register_parameter
-    if include_buffers:
-        old_register_buffer = nn.Module.register_buffer
-
-    def register_empty_parameter(module, name, param):
-        old_register_parameter(module, name, param)
-        if param is not None:
-            param_cls = type(module._parameters[name])
-            kwargs = module._parameters[name].__dict__
-            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
-
-    def register_empty_buffer(module, name, buffer):
-        old_register_buffer(module, name, buffer)
-        if buffer is not None:
-            module._buffers[name] = module._buffers[name].to(device)
-
-    if include_buffers:
-        tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ["empty", "zeros", "ones", "full"]}
-    else:
-        tensor_constructors_to_patch = {}
-
-    def patch_tensor_constructor(fn):
-        def wrapper(*args, **kwargs):
-            kwargs["device"] = device
-            return fn(*args, **kwargs)
-
-        return wrapper
-
-    try:
-        nn.Module.register_parameter = register_empty_parameter
-        if include_buffers:
-            nn.Module.register_buffer = register_empty_buffer
-        for torch_function_name in tensor_constructors_to_patch.keys():
-            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
-        yield
-    finally:
-        nn.Module.register_parameter = old_register_parameter
-        if include_buffers:
-            nn.Module.register_buffer = old_register_buffer
-        for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
-            setattr(torch, torch_function_name, old_torch_function)
diff --git a/mllm/flamingo/mpt/modeling_mpt.py b/mllm/flamingo/mpt/modeling_mpt.py
deleted file mode 100644
index 3a569edf8f1fcc325aa74c05b834ef02b6dbb68b..0000000000000000000000000000000000000000
--- a/mllm/flamingo/mpt/modeling_mpt.py
+++ /dev/null
@@ -1,496 +0,0 @@
-"""A simple, flexible implementation of a GPT model.
- -Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py -""" -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast - -from .attention import attn_bias_shape, build_attn_bias -from .blocks import MPTBlock -from .configuration_mpt import MPTConfig -from .custom_embedding import SharedEmbedding -from .norm import NORM_CLASS_REGISTRY -from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_ - -import torch.distributed as dist - -try: - from .flash_attn_triton import flash_attn_func -except: - pass -Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] - - -class MPTPreTrainedModel(PreTrainedModel): - config_class = MPTConfig - base_model_prefix = "model" - _no_split_modules = ["MPTBlock"] - - -class MPTModel(MPTPreTrainedModel): - def __init__(self, config: MPTConfig): - config._validate_config() - super().__init__(config) - self.attn_impl = config.attn_config["attn_impl"] - self.prefix_lm = config.attn_config["prefix_lm"] - self.attn_uses_sequence_id = config.attn_config["attn_uses_sequence_id"] - self.alibi = config.attn_config["alibi"] - self.alibi_bias_max = config.attn_config["alibi_bias_max"] - if config.init_device == "mixed": - if dist.get_local_rank() == 0: - config.init_device = "cpu" - else: - config.init_device = "meta" - if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys(): - norm_options = " | ".join(NORM_CLASS_REGISTRY.keys()) - raise NotImplementedError(f"Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).") - norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()] - self.embedding_fraction = config.embedding_fraction - self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device) - if not self.alibi: - self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device) - self.emb_drop = nn.Dropout(config.emb_pdrop) - self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)]) - self.norm_f = norm_class(config.d_model, device=config.init_device) - if config.init_device != "meta": - print( - f'You are using config.init_device={config.init_device!r}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.' 
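Each `MPTBlock` stacked into `self.blocks` here follows the standard pre-LayerNorm residual wiring, the same shape as `GPTBlock.forward` later in this diff. A schematic with stand-in attention and MLP modules, not the actual block implementation:

```python
import torch
import torch.nn as nn

class PreLNBlockSketch(nn.Module):
    """Schematic only: no attention bias, masking, or KV caching."""

    def __init__(self, d_model: int, n_heads: int = 4):
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ln_2 = nn.LayerNorm(d_model)
        self.mlp = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(), nn.Linear(4 * d_model, d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        a = self.ln_1(x)
        x = x + self.attn(a, a, a, need_weights=False)[0]  # residual around attention
        x = x + self.mlp(self.ln_2(x))  # residual around the MLP
        return x

x = torch.randn(2, 16, 128)
print(PreLNBlockSketch(128)(x).shape)  # torch.Size([2, 16, 128])
```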
-            )
-        self.apply(self.param_init_fn)
-        self.is_causal = not self.prefix_lm
-        self._attn_bias_initialized = False
-        self.attn_bias = None
-        self.attn_bias_shape = attn_bias_shape(
-            self.attn_impl,
-            config.n_heads,
-            config.max_seq_len,
-            self.alibi,
-            prefix_lm=self.prefix_lm,
-            causal=self.is_causal,
-            use_sequence_id=self.attn_uses_sequence_id,
-        )
-        if config.no_bias:
-            for module in self.modules():
-                if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
-                    if config.verbose:
-                        warnings.warn(f"Removing bias ({module.bias}) from {module}.")
-                    module.register_parameter("bias", None)
-        if config.verbose and config.verbose > 2:
-            print(self)
-        if "verbose" not in self.config.init_config:
-            self.config.init_config["verbose"] = self.config.verbose
-        if self.config.init_config["verbose"] > 1:
-            init_fn_name = self.config.init_config["name"]
-            warnings.warn(f"Using {init_fn_name} initialization.")
-
-    def get_input_embeddings(self):
-        return self.wte
-
-    def set_input_embeddings(self, value):
-        self.wte = value
-
-    @torch.no_grad()
-    def _attn_bias(
-        self,
-        device,
-        dtype,
-        attention_mask: Optional[torch.ByteTensor] = None,
-        prefix_mask: Optional[torch.ByteTensor] = None,
-        sequence_id: Optional[torch.LongTensor] = None,
-    ):
-        if not self._attn_bias_initialized:
-            if self.attn_bias_shape:
-                self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
-                self.attn_bias = build_attn_bias(
-                    self.attn_impl,
-                    self.attn_bias,
-                    self.config.n_heads,
-                    self.config.max_seq_len,
-                    causal=self.is_causal,
-                    alibi=self.alibi,
-                    alibi_bias_max=self.alibi_bias_max,
-                )
-            self._attn_bias_initialized = True
-        if self.attn_impl == "flash":
-            return (self.attn_bias, attention_mask)
-        if self.attn_bias is not None:
-            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
-        attn_bias = self.attn_bias
-        if self.prefix_lm:
-            assert isinstance(attn_bias, torch.Tensor)
-            assert isinstance(prefix_mask, torch.Tensor)
-            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
-        if self.attn_uses_sequence_id and sequence_id is not None:
-            assert isinstance(attn_bias, torch.Tensor)
-            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
-        if attention_mask is not None:
-            s_k = attention_mask.shape[-1]
-            if attn_bias is None:
-                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
-            else:
-                _s_k = max(0, attn_bias.size(-1) - s_k)
-                attn_bias = attn_bias[:, :, :, _s_k:]
-            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
-                raise ValueError(f"attention_mask shape={attention_mask.shape} " + f"and prefix_mask shape={prefix_mask.shape} are not equal.")
-            min_val = torch.finfo(attn_bias.dtype).min
-            attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
-        return (attn_bias, None)
-
-    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
-        (s_k, s_q) = attn_bias.shape[-2:]
-        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
-            raise ValueError(
-                "attn_bias does not match the expected shape. "
-                + f"The last two dimensions should both be {self.config.max_seq_len} "
-                + f"but are {s_k} and {s_q}."
- ) - seq_len = prefix_mask.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError(f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}") - attn_bias = attn_bias[..., :seq_len, :seq_len] - causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len) - prefix = prefix_mask.view(-1, 1, 1, seq_len) - cannot_attend = ~torch.logical_or(causal, prefix.bool()) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - return attn_bias - - def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor): - seq_len = sequence_id.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError(f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}") - attn_bias = attn_bias[..., :seq_len, :seq_len] - cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - return attn_bias - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.ByteTensor] = None, - prefix_mask: Optional[torch.ByteTensor] = None, - sequence_id: Optional[torch.LongTensor] = None, - return_dict: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - use_cache: Optional[bool] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ): - return_dict = return_dict if return_dict is not None else self.config.return_dict - use_cache = use_cache if use_cache is not None else self.config.use_cache - - if attention_mask is not None: - attention_mask = attention_mask.bool() - - if prefix_mask is not None: - prefix_mask = prefix_mask.bool() - - # These args are passed in by keyword in huggingface's generate function - # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206 - # but have not yet been fully implemented in MPTModel - if not return_dict: - raise NotImplementedError("return_dict False is not implemented yet for MPT") - if output_attentions: - if self.attn_impl != "torch": - raise NotImplementedError("output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.") - - if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training: - raise NotImplementedError("MPT does not support training with left padding.") - - if self.prefix_lm and prefix_mask is None: - raise ValueError("prefix_mask is a required argument when MPT is configured with prefix_lm=True.") - - # Raise a not implemented error if input_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT) - if inputs_embeds is not None: - raise NotImplementedError("inputs_embeds is not implemented for MPT.") - - if self.training: - if self.attn_uses_sequence_id and sequence_id is None: - raise ValueError( - "sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True " + "and the model is in train mode." - ) - elif (self.attn_uses_sequence_id is False) and (sequence_id is not None): - warnings.warn( - "MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. 
" - + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True." - ) - - S = input_ids.size(1) - - assert S <= self.config.max_seq_len, f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}" - - tok_emb = self.wte(input_ids) # type: ignore - if self.alibi: - x = tok_emb - else: - past_position = 0 - if past_key_values is not None: - if len(past_key_values) != self.config.n_layers: - raise ValueError( - f"past_key_values must provide a past_key_value for each attention " - + f"layer in the network ({len(past_key_values)=}; {self.config.n_layers=})." - ) - # For attn_impl: triton and flash the past key tensor spec is (batch, seq, dim). - # For attn_impl: torch the past key tensor spec is (batch, heads, head_dim, seq). - # Here we shift position embedding using the `seq` dim of the past key - past_position = past_key_values[0][0].size(1) - if self.attn_impl == "torch": - past_position = past_key_values[0][0].size(3) - - if S + past_position > self.config.max_seq_len: - raise ValueError( - f"Cannot forward input with past sequence length {past_position} and current sequence length " - f"{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}." - ) - pos = torch.arange( - past_position, - S + past_position, - dtype=torch.long, - device=input_ids.device, - ).unsqueeze(0) - if attention_mask is not None: - # adjust the position indices to account for padding tokens - pos = torch.clamp( - pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], - min=0, - ) - - pos_emb = self.wpe(pos) # type: ignore - x = tok_emb + pos_emb - - if self.embedding_fraction == 1: - x = self.emb_drop(x) # type: ignore - else: - # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414 - x_shrunk = (x * self.embedding_fraction) + (x.detach() * (1 - self.embedding_fraction)) - assert isinstance(self.emb_drop, nn.Module) # pyright - x = self.emb_drop(x_shrunk) - - attn_bias, attention_mask = self._attn_bias( - device=x.device, - dtype=torch.float32, - attention_mask=attention_mask, - prefix_mask=prefix_mask, - sequence_id=sequence_id, - ) - - # initialize the past key values cache if it should be used - if use_cache and past_key_values is None: - past_key_values = [() for _ in range(self.config.n_layers)] # type: ignore - - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - for b_idx, block in enumerate(self.blocks): # type: ignore - if output_hidden_states: - assert all_hidden_states is not None # pyright - all_hidden_states = all_hidden_states + (x,) - past_key_value = past_key_values[b_idx] if past_key_values is not None else None - x, attn_weights, past_key_value = block( - x, - past_key_value=past_key_value, - attn_bias=attn_bias, - attention_mask=attention_mask, - is_causal=self.is_causal, - ) - if past_key_values is not None: - past_key_values[b_idx] = past_key_value - - if output_attentions: - assert all_self_attns is not None # pyright - all_self_attns = all_self_attns + (attn_weights,) - - x = self.norm_f(x) # type: ignore - - # add hidden states from the last decoder layer - if output_hidden_states: - assert all_hidden_states is not None # pyright - all_hidden_states = all_hidden_states + (x,) - - return BaseModelOutputWithPast( - last_hidden_state=x, - past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - # Param 
Initialization, needed for device='meta' fast initialization
-    def param_init_fn(self, module):
-        init_fn_name = self.config.init_config["name"]
-        MODEL_INIT_REGISTRY[init_fn_name](
-            module=module,
-            n_layers=self.config.n_layers,
-            d_model=self.config.d_model,
-            **self.config.init_config,
-        )
-
-    def fsdp_wrap_fn(self, module):
-        return isinstance(module, MPTBlock)
-
-    def activation_checkpointing_fn(self, module):
-        return isinstance(module, MPTBlock)
-
-
-class MPTForCausalLM(MPTPreTrainedModel):
-    def __init__(self, config: MPTConfig):
-        super().__init__(config)
-        if not config.tie_word_embeddings:
-            raise ValueError("MPTForCausalLM only supports tied word embeddings")
-        self.transformer = MPTModel(config)
-        for child in self.transformer.children():
-            if isinstance(child, torch.nn.ModuleList):
-                continue
-            if isinstance(child, torch.nn.Module):
-                child._fsdp_wrap = True
-        self.logit_scale = None
-        if config.logit_scale is not None:
-            logit_scale = config.logit_scale
-            if isinstance(logit_scale, str):
-                if logit_scale == "inv_sqrt_d_model":
-                    logit_scale = 1 / math.sqrt(config.d_model)
-                else:
-                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
-            self.logit_scale = logit_scale
-
-    def get_input_embeddings(self):
-        return self.transformer.wte
-
-    def set_input_embeddings(self, value):
-        # self.transformer.wte = value
-        pseudo_wte = SharedEmbedding(value.weight.shape[0], value.weight.shape[1], device=self.transformer.wte.weight.device)
-        pseudo_wte.weight = value.weight
-        self.transformer.wte = pseudo_wte
-
-    def get_output_embeddings(self):
-        return self.transformer.wte
-
-    def set_output_embeddings(self, new_embeddings):
-        # self.transformer.wte = new_embeddings
-        pseudo_wte = SharedEmbedding(new_embeddings.weight.shape[0], new_embeddings.weight.shape[1], device=self.transformer.wte.weight.device)
-        pseudo_wte.weight = new_embeddings.weight
-        self.transformer.wte = pseudo_wte
-
-    def set_decoder(self, decoder):
-        self.transformer = decoder
-
-    def get_decoder(self):
-        return self.transformer
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
-        attention_mask: Optional[torch.ByteTensor] = None,
-        prefix_mask: Optional[torch.ByteTensor] = None,
-        sequence_id: Optional[torch.LongTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        return_dict: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-    ):
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        # if inputs_embeds is not None, raise a NotImplementedError
-        if inputs_embeds is not None:
-            raise NotImplementedError("inputs_embeds has to be None (for hf/peft support).")
-        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.transformer(
-            input_ids=input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            prefix_mask=prefix_mask,
-            sequence_id=sequence_id,
-            return_dict=return_dict,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            use_cache=use_cache,
-        )
-
-        # move outputs to same device as weights for token embedding
-        # needed to support HF `device_map`
-        logits = self.transformer.wte(
input=outputs.last_hidden_state.to(self.transformer.wte.weight.device), - unembed=True, - ) - - if self.logit_scale is not None: - if self.logit_scale == 0: - warnings.warn(f"Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.") - logits *= self.logit_scale - - loss = None - if labels is not None: - _labels = torch.roll(labels, shifts=-1) - _labels[:, -1] = -100 - loss = F.cross_entropy( - logits.view(-1, logits.size(-1)), - _labels.to(logits.device).view(-1), - ) - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def param_init_fn(self, module): - init_fn_name = self.config.init_config["name"] - MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config) - - def fsdp_wrap_fn(self, module): - return isinstance(module, MPTBlock) - - def activation_checkpointing_fn(self, module): - return isinstance(module, MPTBlock) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, **kwargs): - if inputs_embeds is not None: - raise NotImplementedError("inputs_embeds is not implemented for MPT yet") - attention_mask = attention_mask.bool() - if attention_mask[:, -1].sum() != attention_mask.shape[0]: - raise NotImplementedError("MPT does not support generation with right padding.") - if self.transformer.attn_uses_sequence_id and self.training: - sequence_id = torch.zeros_like(input_ids[:1]) - else: - sequence_id = None - if past_key_values is not None: - input_ids = input_ids[:, -1].unsqueeze(-1) - if self.transformer.prefix_lm: - prefix_mask = torch.ones_like(attention_mask) - if kwargs.get("use_cache") == False: - raise NotImplementedError("MPT with prefix_lm=True does not support use_cache=False.") - else: - prefix_mask = None - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "prefix_mask": prefix_mask, - "sequence_id": sequence_id, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache", True), - } - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Used by HuggingFace generate when using beam search with kv-caching. - - See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133 - for an example in transformers. 
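The label handling in `MPTForCausalLM.forward` above is worth pinning down: labels arrive aligned with `input_ids`, so they are rolled left by one and the final slot is masked before the cross-entropy. A toy-sized illustration of the same shift-and-mask, with made-up tensors:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 5)  # (batch, seq_len, vocab)
labels = torch.tensor([[-100, 2, 4, 1]])  # -100 marks positions without loss

# Same shift as in forward(): position t is supervised by token t + 1.
_labels = torch.roll(labels, shifts=-1)
_labels[:, -1] = -100  # the rolled-around slot has no successor token

# F.cross_entropy skips -100 targets by default (ignore_index=-100).
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.view(-1))
```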
- """ - reordered_past = [] - for layer_past in past_key_values: - reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))] - return reordered_past diff --git a/mllm/flamingo/mpt/norm.py b/mllm/flamingo/mpt/norm.py deleted file mode 100644 index 9eb43abb1ce81b8168a936461796fd4fda8cb531..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/norm.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch - - -def _cast_if_autocast_enabled(tensor): - if torch.is_autocast_enabled(): - if tensor.device.type == "cuda": - dtype = torch.get_autocast_gpu_dtype() - elif tensor.device.type == "cpu": - dtype = torch.get_autocast_cpu_dtype() - else: - raise NotImplementedError() - return tensor.to(dtype=dtype) - return tensor - - -class LPLayerNorm(torch.nn.LayerNorm): - def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): - super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype) - - def forward(self, x): - module_device = x.device - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight - downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias - with torch.autocast(enabled=False, device_type=module_device.type): - return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps) - - -def rms_norm(x, weight=None, eps=1e-05): - output = x / torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) - if weight is not None: - return output * weight - return output - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): - super().__init__() - self.eps = eps - if weight: - self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) - else: - self.register_parameter("weight", None) - - def forward(self, x): - return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) - - -class LPRMSNorm(RMSNorm): - def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): - super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device) - - def forward(self, x): - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight - with torch.autocast(enabled=False, device_type=x.device.type): - return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) - - -NORM_CLASS_REGISTRY = {"layernorm": torch.nn.LayerNorm, "low_precision_layernorm": LPLayerNorm, "rmsnorm": RMSNorm, "low_precision_rmsnorm": LPRMSNorm} diff --git a/mllm/flamingo/mpt/param_init_fns.py b/mllm/flamingo/mpt/param_init_fns.py deleted file mode 100644 index f1bfa6722b8d2d7abbe2d608ab02afe67adcdcde..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt/param_init_fns.py +++ /dev/null @@ -1,369 +0,0 @@ -import math -import warnings -from collections.abc import Sequence -from functools import partial -from typing import Optional, Tuple, Union -import torch -from torch import nn -from .norm import NORM_CLASS_REGISTRY - - -def torch_default_param_init_fn_(module: nn.Module, verbose: int = 0, **kwargs): - del kwargs - if verbose > 1: - warnings.warn(f"Initializing network using module's reset_parameters attribute") - if hasattr(module, "reset_parameters"): - 
module.reset_parameters() - - -def fused_init_helper_(module: nn.Module, init_fn_): - _fused = getattr(module, "_fused", None) - if _fused is None: - raise RuntimeError(f"Internal logic error") - (dim, splits) = _fused - splits = (0, *splits, module.weight.size(dim)) - for s, e in zip(splits[:-1], splits[1:]): - slice_indices = [slice(None)] * module.weight.ndim - slice_indices[dim] = slice(s, e) - init_fn_(module.weight[slice_indices]) - - -def generic_param_init_fn_( - module: nn.Module, - init_fn_, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs - if verbose > 1: - warnings.warn(f"If model has bias parameters they are initialized to 0.") - init_div_is_residual = init_div_is_residual - if init_div_is_residual is False: - div_is_residual = 1.0 - elif init_div_is_residual is True: - div_is_residual = math.sqrt(2 * n_layers) - elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int): - div_is_residual = init_div_is_residual - elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric(): - div_is_residual = float(init_div_is_residual) - else: - div_is_residual = 1.0 - raise ValueError(f"Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}") - if init_div_is_residual is not False: - if verbose > 1: - warnings.warn( - f"Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. " - + f"Set `init_div_is_residual: false` in init config to disable this." - ) - if isinstance(module, nn.Linear): - if hasattr(module, "_fused"): - fused_init_helper_(module, init_fn_) - else: - init_fn_(module.weight) - if module.bias is not None: - torch.nn.init.zeros_(module.bias) - if init_div_is_residual is not False and getattr(module, "_is_residual", False): - with torch.no_grad(): - module.weight.div_(div_is_residual) - elif isinstance(module, nn.Embedding): - if emb_init_std is not None: - std = emb_init_std - if std == 0: - warnings.warn(f"Embedding layer initialized to 0.") - emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std) - if verbose > 1: - warnings.warn(f"Embedding layer initialized using normal distribution with mean=0 and std={std!r}.") - elif emb_init_uniform_lim is not None: - lim = emb_init_uniform_lim - if isinstance(lim, Sequence): - if len(lim) > 2: - raise ValueError(f"Uniform init requires a min and a max limit. User input: {lim}.") - if lim[0] == lim[1]: - warnings.warn(f"Embedding layer initialized to {lim[0]}.") - else: - if lim == 0: - warnings.warn(f"Embedding layer initialized to 0.") - lim = [-lim, lim] - (a, b) = lim - emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b) - if verbose > 1: - warnings.warn(f"Embedding layer initialized using uniform distribution in range {lim}.") - else: - emb_init_fn_ = init_fn_ - emb_init_fn_(module.weight) - elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))): - if verbose > 1: - warnings.warn(f"Norm weights are set to 1. 
If norm layer has a bias it is initialized to 0.") - if hasattr(module, "weight") and module.weight is not None: - torch.nn.init.ones_(module.weight) - if hasattr(module, "bias") and module.bias is not None: - torch.nn.init.zeros_(module.bias) - elif isinstance(module, nn.MultiheadAttention): - if module._qkv_same_embed_dim: - assert module.in_proj_weight is not None - assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None) - assert d_model is not None - _d = d_model - splits = (0, _d, 2 * _d, 3 * _d) - for s, e in zip(splits[:-1], splits[1:]): - init_fn_(module.in_proj_weight[s:e]) - else: - assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None) - assert module.in_proj_weight is None - init_fn_(module.q_proj_weight) - init_fn_(module.k_proj_weight) - init_fn_(module.v_proj_weight) - if module.in_proj_bias is not None: - torch.nn.init.zeros_(module.in_proj_bias) - if module.bias_k is not None: - torch.nn.init.zeros_(module.bias_k) - if module.bias_v is not None: - torch.nn.init.zeros_(module.bias_v) - init_fn_(module.out_proj.weight) - if init_div_is_residual is not False and getattr(module.out_proj, "_is_residual", False): - with torch.no_grad(): - module.out_proj.weight.div_(div_is_residual) - if module.out_proj.bias is not None: - torch.nn.init.zeros_(module.out_proj.bias) - else: - for _ in module.parameters(recurse=False): - raise NotImplementedError(f"{module.__class__.__name__} parameters are not initialized by param_init_fn.") - - -def _normal_init_(std, mean=0.0): - return partial(torch.nn.init.normal_, mean=mean, std=std) - - -def _normal_param_init_fn_( - module: nn.Module, - std: float, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs - init_fn_ = _normal_init_(std=std) - if verbose > 1: - warnings.warn(f"Using torch.nn.init.normal_ init fn mean=0.0, std={std}") - generic_param_init_fn_( - module=module, - init_fn_=init_fn_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def baseline_param_init_fn_( - module: nn.Module, - init_std: float, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs - if init_std is None: - raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.") - _normal_param_init_fn_( - module=module, - std=init_std, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def small_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: int, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs - std = math.sqrt(2 / (5 * d_model)) - _normal_param_init_fn_( - module=module, - std=std, - d_model=d_model, - 
n_layers=n_layers,
-        init_div_is_residual=init_div_is_residual,
-        emb_init_std=emb_init_std,
-        emb_init_uniform_lim=emb_init_uniform_lim,
-        verbose=verbose,
-    )
-
-
-def neox_param_init_fn_(
-    module: nn.Module,
-    n_layers: int,
-    d_model: int,
-    emb_init_std: Optional[float] = None,
-    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-    verbose: int = 0,
-    **kwargs,
-):
-    """From section 2.3.1 of GPT-NeoX-20B:
-
-    An Open-Source Autoregressive Language Model — Black et al. (2022)
-    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
-    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
-    """
-    del kwargs
-    residual_div = n_layers / math.sqrt(10)
-    if verbose > 1:
-        warnings.warn(f"setting init_div_is_residual to {residual_div}")
-    small_param_init_fn_(
-        module=module,
-        d_model=d_model,
-        n_layers=n_layers,
-        init_div_is_residual=residual_div,
-        emb_init_std=emb_init_std,
-        emb_init_uniform_lim=emb_init_uniform_lim,
-        verbose=verbose,
-    )
-
-
-def kaiming_uniform_param_init_fn_(
-    module: nn.Module,
-    n_layers: int,
-    d_model: Optional[int] = None,
-    init_div_is_residual: Union[int, float, str, bool] = True,
-    emb_init_std: Optional[float] = None,
-    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-    init_gain: float = 0,
-    fan_mode: str = "fan_in",
-    init_nonlinearity: str = "leaky_relu",
-    verbose: int = 0,
-    **kwargs,
-):
-    del kwargs
-    if verbose > 1:
-        warnings.warn(f"Using nn.init.kaiming_uniform_ init fn with parameters: " + f"a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}")
-    kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
-    generic_param_init_fn_(
-        module=module,
-        init_fn_=kaiming_uniform_,
-        d_model=d_model,
-        n_layers=n_layers,
-        init_div_is_residual=init_div_is_residual,
-        emb_init_std=emb_init_std,
-        emb_init_uniform_lim=emb_init_uniform_lim,
-        verbose=verbose,
-    )
-
-
-def kaiming_normal_param_init_fn_(
-    module: nn.Module,
-    n_layers: int,
-    d_model: Optional[int] = None,
-    init_div_is_residual: Union[int, float, str, bool] = True,
-    emb_init_std: Optional[float] = None,
-    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-    init_gain: float = 0,
-    fan_mode: str = "fan_in",
-    init_nonlinearity: str = "leaky_relu",
-    verbose: int = 0,
-    **kwargs,
-):
-    del kwargs
-    if verbose > 1:
-        warnings.warn(f"Using nn.init.kaiming_normal_ init fn with parameters: " + f"a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}")
-    kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
-    generic_param_init_fn_(
-        module=module,
-        init_fn_=kaiming_normal_,
-        d_model=d_model,
-        n_layers=n_layers,
-        init_div_is_residual=init_div_is_residual,
-        emb_init_std=emb_init_std,
-        emb_init_uniform_lim=emb_init_uniform_lim,
-        verbose=verbose,
-    )
-
-
-def xavier_uniform_param_init_fn_(
-    module: nn.Module,
-    n_layers: int,
-    d_model: Optional[int] = None,
-    init_div_is_residual: Union[int, float, str, bool] = True,
-    emb_init_std: Optional[float] = None,
-    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-    init_gain: float = 0,
-    verbose: int = 0,
-    **kwargs,
-):
-    del kwargs
-    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
-    if verbose > 1:
-        warnings.warn(f"Using torch.nn.init.xavier_uniform_ init fn with parameters: " +
f"gain={init_gain}") - generic_param_init_fn_( - module=module, - init_fn_=xavier_uniform_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def xavier_normal_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - init_gain: float = 0, - verbose: int = 0, - **kwargs, -): - xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain) - if verbose > 1: - warnings.warn(f"Using torch.nn.init.xavier_normal_ init fn with parameters: " + f"gain={init_gain}") - generic_param_init_fn_( - module=module, - init_fn_=xavier_normal_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -MODEL_INIT_REGISTRY = { - "default_": torch_default_param_init_fn_, - "baseline_": baseline_param_init_fn_, - "kaiming_uniform_": kaiming_uniform_param_init_fn_, - "kaiming_normal_": kaiming_normal_param_init_fn_, - "neox_init_": neox_param_init_fn_, - "small_init_": small_param_init_fn_, - "xavier_uniform_": xavier_uniform_param_init_fn_, - "xavier_normal_": xavier_normal_param_init_fn_, -} diff --git a/mllm/flamingo/mpt_redpajama/__init__.py b/mllm/flamingo/mpt_redpajama/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/__init__.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 119409cfe74ad242a975380615f1bd5d5bcc9612..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/attention.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/attention.cpython-39.pyc deleted file mode 100644 index f1fe4ed95c30a6b7edd129bf0aa8e54c38c4e427..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/attention.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/configuration_mosaic_gpt.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/configuration_mosaic_gpt.cpython-39.pyc deleted file mode 100644 index d58ba26867465859464c53702e435612bd755528..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/configuration_mosaic_gpt.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/gpt_blocks.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/gpt_blocks.cpython-39.pyc deleted file mode 100644 index d2387736a3dca169fa862e7fc39634bd9ed14692..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/gpt_blocks.cpython-39.pyc and /dev/null differ diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/low_precision_layernorm.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/low_precision_layernorm.cpython-39.pyc deleted file mode 100644 index 7e2d41ed98085cbb6900c7e30cf0e6a0491a054b..0000000000000000000000000000000000000000 Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/low_precision_layernorm.cpython-39.pyc and 
/dev/null differ
diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/mosaic_gpt.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/mosaic_gpt.cpython-39.pyc
deleted file mode 100644
index 690e45d761adae1e16b297280ba3f0e6eaa96962..0000000000000000000000000000000000000000
Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/mosaic_gpt.cpython-39.pyc and /dev/null differ
diff --git a/mllm/flamingo/mpt_redpajama/__pycache__/param_init_fns.cpython-39.pyc b/mllm/flamingo/mpt_redpajama/__pycache__/param_init_fns.cpython-39.pyc
deleted file mode 100644
index c361d5b8699cffe978a22d1563e0060d251d2971..0000000000000000000000000000000000000000
Binary files a/mllm/flamingo/mpt_redpajama/__pycache__/param_init_fns.cpython-39.pyc and /dev/null differ
diff --git a/mllm/flamingo/mpt_redpajama/attention.py b/mllm/flamingo/mpt_redpajama/attention.py
deleted file mode 100644
index 4dd08dab9f5c0ccd5df7eef24f8a013b37f23ca2..0000000000000000000000000000000000000000
--- a/mllm/flamingo/mpt_redpajama/attention.py
+++ /dev/null
@@ -1,361 +0,0 @@
-# Copyright 2022 MosaicML Examples authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Attention layers."""
-
-import math
-import warnings
-from typing import Optional
-
-import torch
-import torch.nn as nn
-from einops import rearrange
-from torch import nn
-
-from .low_precision_layernorm import LPLayerNorm
-
-
-def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
-    if original_is_causal and num_query_tokens != num_key_tokens:
-        if num_query_tokens != 1:
-            raise NotImplementedError("MosaicGPT does not support query and key with different number of tokens, unless number of query tokens is 1.")
-        else:
-            return False
-    return original_is_causal
-
-
-def scaled_multihead_dot_product_attention(
-    query,
-    key,
-    value,
-    n_heads,
-    softmax_scale=None,
-    attn_bias=None,
-    key_padding_mask=None,
-    is_causal=False,
-    dropout_p=0.0,
-    training=False,
-    needs_weights=False,
-):
-    q = rearrange(query, "b s (h d) -> b h s d", h=n_heads)
-    k = rearrange(key, "b s (h d) -> b h d s", h=n_heads)  # includes key.t()
-    v = rearrange(value, "b s (h d) -> b h s d", h=n_heads)
-
-    min_val = torch.finfo(q.dtype).min
-
-    b, _, s_q, d = q.shape
-    s_k = k.size(-1)
-
-    if softmax_scale is None:
-        softmax_scale = 1 / math.sqrt(d)
-
-    attn_weight = q.matmul(k) * softmax_scale
-
-    if attn_bias is not None:
-        if (attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k) or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
-            raise RuntimeError(f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.")
-        attn_weight = attn_weight + attn_bias
-
-    if key_padding_mask is not None:
-        if attn_bias is not None:
-            warnings.warn(
-                "Propagating key_padding_mask to the attention module "
-                + "and applying it within the attention module can cause "
-                + "unnecessary computation/memory usage. Consider integrating "
-                + "into attn_bias once and passing that to each attention "
-                + "module instead."
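A shape-level sketch of what `scaled_multihead_dot_product_attention` computes, using the same `einops` rearranges; toy sizes, and no bias, padding mask, or dropout:

```python
import math
import torch
from einops import rearrange

n_heads, head_dim = 4, 8
query = key = value = torch.randn(2, 5, n_heads * head_dim)  # (b, s, h * d)

q = rearrange(query, "b s (h d) -> b h s d", h=n_heads)
k = rearrange(key, "b s (h d) -> b h d s", h=n_heads)  # transposed so q @ k works
v = rearrange(value, "b s (h d) -> b h s d", h=n_heads)

attn_weight = torch.softmax(q.matmul(k) / math.sqrt(head_dim), dim=-1)
out = rearrange(attn_weight.matmul(v), "b h s d -> b s (h d)")
print(out.shape)  # torch.Size([2, 5, 32])
```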
- ) - attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val) - - if is_causal: - s = max(s_q, s_k) - causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16) - causal_mask = causal_mask.tril() - causal_mask = causal_mask.to(torch.bool) - causal_mask = ~causal_mask - causal_mask = causal_mask[-s_q:, -s_k:] - attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val) - - attn_weight = torch.softmax(attn_weight, dim=-1) - - if dropout_p: - attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True) - - out = attn_weight.matmul(v) - out = rearrange(out, "b h s d -> b s (h d)") - - if needs_weights: - return out, attn_weight - return out, None - - -def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]): - for tensor in tensors: - if tensor.dtype not in valid_dtypes: - raise TypeError(f"{tensor.dtype=} must be in {valid_dtypes=}.") - if not tensor.is_cuda: - raise TypeError(f"Inputs must be cuda tensors ({tensor.is_cuda=}).") - - -def flash_attn_fn( - query, - key, - value, - n_heads, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, -): - try: - from flash_attn import bert_padding, flash_attn_interface - except: - raise RuntimeError("Please install flash_attn==0.2.8") - - check_valid_inputs(query, key, value) - - if attn_bias is not None: - raise NotImplementedError(f"attn_bias not implemented for flash attn.") - - batch_size, seqlen = query.shape[:2] - - if key_padding_mask is None: - key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) - query_padding_mask = key_padding_mask[:, -query.size(1) :] - - query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(query, query_padding_mask) - query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads) - - key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(key, key_padding_mask) - key_unpad = rearrange(key_unpad, "nnz (h d) -> nnz h d", h=n_heads) - - value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask) - value_unpad = rearrange(value_unpad, "nnz (h d) -> nnz h d", h=n_heads) - - dropout_p = dropout_p if training else 0.0 - - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - - output_unpad = flash_attn_interface.flash_attn_unpadded_func( - query_unpad, - key_unpad, - value_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=softmax_scale, - causal=reset_is_causal, - return_attn_probs=needs_weights, - ) - - output = bert_padding.pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen) - return output, None - - -def triton_flash_attn_fn( - query, - key, - value, - n_heads, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, -): - try: - from flash_attn import flash_attn_triton # type: ignore - except: - raise RuntimeError("Please install flash_attn==0.2.8 and triton==2.0.0.dev20221202.") - - check_valid_inputs(query, key, value) - - if dropout_p: - raise NotImplementedError(f"Dropout not implemented for attn_impl: triton.") - - if needs_weights: - raise NotImplementedError(f"attn_impl: triton cannot return attn weights.") - - if key_padding_mask is not None: - warnings.warn( - "Propagating key_padding_mask to the attention module " - + "and applying it within the 
attention module can cause "
-            + "unnecessary computation/memory usage. Consider integrating "
-            + "into attn_bias once and passing that to each attention "
-            + "module instead."
-        )
-        b_size, s_k = key_padding_mask.shape[:2]
-
-        if attn_bias is None:
-            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
-
-        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
-
-    query = rearrange(query, "b s (h d) -> b s h d", h=n_heads)
-    key = rearrange(key, "b s (h d) -> b s h d", h=n_heads)
-    value = rearrange(value, "b s (h d) -> b s h d", h=n_heads)
-
-    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
-    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
-
-    output = attn_output.view(*attn_output.shape[:2], -1)
-
-    return output, None
-
-
-class MultiheadAttention(nn.Module):
-    """Multi-head self attention.
-
-    Using torch or triton attention implementation enables the user to also use
-    additive bias.
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        n_heads: int,
-        attn_impl: str = "triton",
-        attn_clip_qkv: Optional[float] = None,
-        attn_qk_ln: bool = False,
-        softmax_scale: Optional[float] = None,
-        attn_pdrop: float = 0.0,
-        low_precision_layernorm: bool = False,
-        device: Optional[str] = None,
-    ):
-        super().__init__()
-
-        self.attn_impl = attn_impl
-        self.clip_qkv = attn_clip_qkv
-        self.attn_qk_ln = attn_qk_ln
-
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.softmax_scale = softmax_scale
-        if self.softmax_scale is None:
-            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
-        self.attn_dropout_p = attn_pdrop
-
-        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
-        # for param init fn; enables shape based init of fused layers
-        fuse_splits = (d_model, 2 * d_model)
-        self.Wqkv._fused = (0, fuse_splits)  # type: ignore
-
-        if self.attn_qk_ln:
-            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
-            self.q_ln = layernorm_class(self.d_model, device=device)
-            self.k_ln = layernorm_class(self.d_model, device=device)
-
-        if self.attn_impl == "flash":
-            self.attn_fn = flash_attn_fn
-        elif self.attn_impl == "triton":
-            self.attn_fn = triton_flash_attn_fn
-            warnings.warn(
-                "While `attn_impl: triton` can be faster than `attn_impl: flash` "
-                + "it uses more memory. When training larger models this can trigger "
-                + "alloc retries which hurts performance. If encountered, we recommend "
-                + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`."
-            )
-        elif self.attn_impl == "torch":
-            self.attn_fn = scaled_multihead_dot_product_attention
-            if torch.cuda.is_available():
-                warnings.warn(
-                    "Using `attn_impl: torch`. If your model does not use `alibi` or "
-                    + "`prefix_lm` we recommend using `attn_impl: flash` otherwise "
-                    + "we recommend using `attn_impl: triton`."
- ) - else: - raise ValueError(f"{attn_impl=} is an invalid setting.") - - self.out_proj = nn.Linear(self.d_model, self.d_model, device=device) - self.out_proj._is_residual = True # type: ignore - - def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False): - qkv = self.Wqkv(x) - - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - - query, key, value = qkv.chunk(3, dim=2) - - key_padding_mask = attention_mask - - if self.attn_qk_ln: - # Applying layernorm to qk - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - - past_key_value = (key, value) - - if attn_bias is not None: - attn_bias = attn_bias[:, :, -query.size(1) :, -key.size(1) :] - - context, attn_weights = self.attn_fn( - query, - key, - value, - self.n_heads, - softmax_scale=self.softmax_scale, - attn_bias=attn_bias, - key_padding_mask=key_padding_mask, - is_causal=is_causal, - dropout_p=self.attn_dropout_p, - training=self.training, - needs_weights=needs_weights, - ) - - return self.out_proj(context), attn_weights, past_key_value - - -def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id): - if attn_impl == "flash": - return None - elif attn_impl in ["torch", "triton"]: - if alibi: - if (prefix_lm or not causal) or use_sequence_id: - return (1, n_heads, seq_len, seq_len) - return (1, n_heads, 1, seq_len) - elif prefix_lm or use_sequence_id: - return (1, 1, seq_len, seq_len) - return None - else: - raise ValueError(f"{attn_impl=} is an invalid setting.") - - -def attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8): - if attn_impl == "flash": - return None - elif attn_impl in ["torch", "triton"]: - if alibi: - # in place add alibi to attn bias - device, dtype = attn_bias.device, attn_bias.dtype - attn_bias = attn_bias.add(alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype)) - return attn_bias - else: - raise ValueError(f"{attn_impl=} is an invalid setting.") - - -def alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None): - alibi_bias = torch.arange(1 - seq_len, 1, dtype=dtype, device=device).view(1, 1, 1, seq_len) - if full: - # generate 1 x Heads x SeqLen x SeqLen alibi bias mask - # otherwise the mask is 1 x Heads x 1 x SeqLen (which is broadcast to the appropriate size) - alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=dtype, device=device).view(1, 1, seq_len, 1) - alibi_bias = alibi_bias.abs().mul(-1) - - m = torch.arange(1, n_heads + 1, dtype=dtype, device=device) - m = m.mul(alibi_bias_max / n_heads) - alibi_bias = alibi_bias * (1.0 / (2 ** m.view(1, n_heads, 1, 1))) - return alibi_bias diff --git a/mllm/flamingo/mpt_redpajama/configuration_mosaic_gpt.py b/mllm/flamingo/mpt_redpajama/configuration_mosaic_gpt.py deleted file mode 100644 index 26da546bcb0c2f6ce1fe82ab8ab8b3b3f001fee1..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt_redpajama/configuration_mosaic_gpt.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright 2022 MosaicML Examples authors -# SPDX-License-Identifier: Apache-2.0 - -"""A HuggingFace-style model configuration.""" - -from typing import Optional, Tuple, Union - -from transformers import PretrainedConfig - - -class 
MosaicGPTConfig(PretrainedConfig):
-    model_type = "mosaic_gpt"
-
-    def __init__(
-        self,
-        d_model: int = 2048,
-        n_heads: int = 16,
-        n_layers: int = 24,
-        mlp_ratio: int = 4,
-        max_seq_len: int = 2048,
-        vocab_size: int = 50368,
-        attn_pdrop: float = 0.0,
-        resid_pdrop: float = 0.0,
-        emb_pdrop: float = 0.0,
-        attn_impl: str = "triton",
-        attn_qk_ln: bool = False,
-        attn_clip_qkv: Optional[float] = None,
-        softmax_scale: Optional[float] = None,
-        prefix_lm: Optional[bool] = False,
-        attn_uses_sequence_id: Optional[bool] = False,
-        alibi: bool = False,
-        alibi_bias_max: int = 8,
-        init_device: str = "cpu",
-        logit_scale: Optional[Union[float, str]] = None,
-        no_bias: bool = False,
-        verbose: int = 0,
-        param_init_fn: str = "kaiming_normal_",
-        init_div_is_residual: Union[int, float, str, bool] = True,
-        init_std: float = 0.02,
-        emb_init_std: Optional[float] = None,
-        emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-        init_gain: float = 0,
-        fan_mode: str = "fan_in",
-        init_nonlinearity: str = "relu",
-        embedding_fraction: float = 1.0,
-        low_precision_layernorm: bool = True,
-        use_cache: bool = False,
-        **kwargs,
-    ):
-        """The MosaicGPT configuration class.
-
-        Args:
-            d_model (int): The size of the embedding dimension of the model.
-            n_heads (int): The number of attention heads.
-            n_layers (int): The number of layers in the model.
-            mlp_ratio (int): The ratio of the up/down scale in the MLP.
-            max_seq_len (int): The maximum sequence length of the model.
-            vocab_size (int): The size of the vocabulary.
-            attn_pdrop (float): The dropout probability for the attention layers.
-            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
-            emb_pdrop (float): The dropout probability for the embedding layer.
-            attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
-            attn_qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
-            attn_clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
-                this value.
-            softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
-                use the default scale of ``1/sqrt(d_keys)``.
-            prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
-                extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
-                can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
-            attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
-                When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
-                which sub-sequence each token belongs to.
-                Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
-            alibi (bool): Whether to use the alibi bias instead of position embeddings.
-            alibi_bias_max (int): The maximum value of the alibi bias.
-            init_device (str): The device to use for parameter initialization.
-            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
-            no_bias (bool): Whether to remove bias terms from all layers.
-            verbose (int): The verbosity level. 0 is silent.
-            param_init_fn (str): The parameter initialization scheme to use. One of 'default_', 'baseline_', 'kaiming_uniform_',
-                'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 'xavier_normal_'.
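A construction sketch for this config class, assuming the pre-deletion module layout; `_validate_config` (defined further below) runs at the end of `__init__`, so invalid combinations fail fast:

```python
from mllm.flamingo.mpt_redpajama.configuration_mosaic_gpt import MosaicGPTConfig  # pre-deletion path

config = MosaicGPTConfig(
    d_model=1024,   # must be divisible by n_heads
    n_heads=16,
    n_layers=12,
    max_seq_len=1024,
    attn_impl="torch",  # alibi requires 'torch' or 'triton'
    alibi=True,
)
print(config.model_type)  # mosaic_gpt

# MosaicGPTConfig(d_model=1024, n_heads=10) would raise:
# ValueError: d_model must be divisible by n_heads
```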
- init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. - low_precision_layernorm (bool): Whether to use low precision layer normalization. - use_cache (bool): Whether or not the model should return the last key/values attentions - """ - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.mlp_ratio = mlp_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.attn_pdrop = attn_pdrop - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.attn_impl = attn_impl - self.attn_qk_ln = attn_qk_ln - self.attn_clip_qkv = attn_clip_qkv - self.softmax_scale = softmax_scale - self.prefix_lm = prefix_lm - self.attn_uses_sequence_id = attn_uses_sequence_id - self.alibi = alibi - self.alibi_bias_max = alibi_bias_max - self.init_device = init_device - self.logit_scale = logit_scale - self.no_bias = no_bias - self.verbose = verbose - self.param_init_fn = param_init_fn - self.init_div_is_residual = init_div_is_residual - self.init_std = init_std - self.emb_init_std = emb_init_std - self.emb_init_uniform_lim = emb_init_uniform_lim - self.init_std = init_std - self.init_gain = init_gain - self.fan_mode = fan_mode - self.init_nonlinearity = init_nonlinearity - self.embedding_fraction = embedding_fraction - self.low_precision_layernorm = low_precision_layernorm - self.use_cache = use_cache - if "name" in kwargs: - del kwargs["name"] - if "loss_fn" in kwargs: - del kwargs["loss_fn"] - super().__init__(**kwargs) - - self._validate_config() - - def _validate_config(self): - if self.d_model % self.n_heads != 0: - raise ValueError("d_model must be divisible by n_heads") - if any(prob < 0 or prob > 1 for prob in [self.attn_pdrop, self.resid_pdrop, self.emb_pdrop]): - raise ValueError("attn_pdrop, resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1") - if self.attn_impl not in ["torch", "flash", "triton"]: - raise ValueError(f"Unknown attn_impl={self.attn_impl}") - if self.prefix_lm and self.attn_impl not in ["torch", "triton"]: - raise NotImplementedError("prefix_lm only implemented with torch and triton attention.") - if self.alibi and self.attn_impl not in ["torch", "triton"]: - raise NotImplementedError("alibi only implemented with torch and triton attention.") - if self.attn_uses_sequence_id and self.attn_impl not in ["torch", "triton"]: - raise NotImplementedError("attn_uses_sequence_id only implemented with torch and triton attention.") - if self.embedding_fraction > 1 or self.embedding_fraction <= 0: - raise ValueError("model.embedding_fraction must be between 0 
(exclusive) and 1 (inclusive)!") - if isinstance(self.logit_scale, str) and self.logit_scale != "inv_sqrt_d_model": - raise ValueError(f"{self.logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") diff --git a/mllm/flamingo/mpt_redpajama/gpt_blocks.py b/mllm/flamingo/mpt_redpajama/gpt_blocks.py deleted file mode 100644 index a60e09edc86ec0580b49abac71ec5a3c3f9847d6..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt_redpajama/gpt_blocks.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2022 MosaicML Examples authors -# SPDX-License-Identifier: Apache-2.0 - -"""GPT Blocks used for the GPT Model.""" - -from typing import Optional, Tuple - -import torch -import torch.nn as nn - -from .attention import MultiheadAttention -from .low_precision_layernorm import LPLayerNorm - - -class GPTMLP(nn.Module): - def __init__(self, d_model: int, mlp_ratio: int, device: Optional[str] = None): - super().__init__() - self.mlp_up = nn.Linear(d_model, mlp_ratio * d_model, device=device) - self.mlp_act = nn.GELU() - self.mlp_down = nn.Linear(mlp_ratio * d_model, d_model, device=device) - self.mlp_down._is_residual = True # type: ignore - - def forward(self, x): - return self.mlp_down(self.mlp_act(self.mlp_up(x))) - - -class GPTBlock(nn.Module): - def __init__( - self, - attn_impl: str, - d_model: int, - n_heads: int, - mlp_ratio: int, - attn_clip_qkv: Optional[float] = None, - attn_qk_ln: bool = False, - softmax_scale: Optional[float] = None, - attn_pdrop: float = 0.0, - alibi: bool = False, - resid_pdrop: float = 0.0, - low_precision_layernorm: bool = False, - device: Optional[str] = None, - **kwargs - ): - del kwargs # unused, just to capture any extra args from the config - super().__init__() - - layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm - - self.ln_1 = layernorm_class(d_model, device=device) - self.attn = MultiheadAttention( - attn_impl=attn_impl, - attn_clip_qkv=attn_clip_qkv, - attn_qk_ln=attn_qk_ln, - softmax_scale=softmax_scale, - attn_pdrop=attn_pdrop, - d_model=d_model, - n_heads=n_heads, - device=device, - ) - self.ln_2 = layernorm_class(d_model, device=device) - self.mlp = GPTMLP( - d_model=d_model, - mlp_ratio=mlp_ratio, - device=device, - ) - self.resid_attn_dropout = nn.Dropout(resid_pdrop) - self.resid_mlp_dropout = nn.Dropout(resid_pdrop) - - def forward( - self, - x: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attn_bias: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.ByteTensor] = None, - is_causal: bool = True, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: - a = self.ln_1(x) - b, _, past_key_value = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal) - x = x + self.resid_attn_dropout(b) - m = self.ln_2(x) - n = self.mlp(m) - x = x + self.resid_mlp_dropout(n) - return x, past_key_value diff --git a/mllm/flamingo/mpt_redpajama/low_precision_layernorm.py b/mllm/flamingo/mpt_redpajama/low_precision_layernorm.py deleted file mode 100644 index eb3d4b7b9c8f70ceb3af688cf65fb627a45154db..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt_redpajama/low_precision_layernorm.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch -import torch.nn.functional as F - - -class LPLayerNorm(torch.nn.LayerNorm): - def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): - super().__init__( - normalized_shape=normalized_shape, - eps=eps, - 
elementwise_affine=elementwise_affine, - device=device, - dtype=dtype, - ) - - def forward(self, x): - module_device = x.device - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight - downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias - with torch.autocast(enabled=False, device_type=module_device.type): - return F.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps) - - -def _cast_if_autocast_enabled(tensor): - if torch.is_autocast_enabled(): - if tensor.device.type == "cuda": - dtype = torch.get_autocast_gpu_dtype() - elif tensor.device.type == "cpu": - dtype = torch.get_autocast_cpu_dtype() - else: - raise NotImplementedError() - return tensor.to(dtype=dtype) - return tensor diff --git a/mllm/flamingo/mpt_redpajama/mosaic_gpt.py b/mllm/flamingo/mpt_redpajama/mosaic_gpt.py deleted file mode 100644 index cc69466d94e694637343554ee95a53075989a79e..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt_redpajama/mosaic_gpt.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright 2022 MosaicML Examples authors -# SPDX-License-Identifier: Apache-2.0 - -"""A simple, flexible implementation of a GPT model. - -Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py -""" - -import math -import warnings -from typing import List, Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PreTrainedModel -from transformers.modeling_outputs import CausalLMOutputWithPast - -from mllm.flamingo.mpt.custom_embedding import SharedEmbedding - -from .attention import attn_bias as module_attn_bias -from .attention import attn_bias_shape as module_attn_bias_shape -from .configuration_mosaic_gpt import MosaicGPTConfig -from .gpt_blocks import GPTBlock -from .low_precision_layernorm import LPLayerNorm -from .param_init_fns import MODEL_INIT_REGISTRY - - -class MosaicGPT(PreTrainedModel): - config_class = MosaicGPTConfig - base_model_prefix = "mosaic_gpt" - - def __init__(self, config: MosaicGPTConfig): - super().__init__(config) - - if config.attn_impl == "flash" and config.alibi: - raise RuntimeError("ALiBi is not supported with flash attention. 
Please use triton or torch.") - - self.attn_impl = config.attn_impl - self.prefix_lm = config.prefix_lm - self.attn_uses_sequence_id = config.attn_uses_sequence_id - self.alibi = config.alibi - self.alibi_bias_max = config.alibi_bias_max - - layernorm_class = LPLayerNorm if config.low_precision_layernorm else nn.LayerNorm - - # CogView (https://arxiv.org/abs/2105.13290) and GLM-130B (https://arxiv.org/abs/2210.02414) - # both report this helping with stabilizing training - self.embedding_fraction = config.embedding_fraction - - self.transformer = nn.ModuleDict({"wte": nn.Embedding(config.vocab_size, config.d_model, device=config.init_device)}) - if not self.alibi: - self.transformer.update({"wpe": nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)}) - self.transformer.update({"emb_drop": nn.Dropout(config.emb_pdrop)}) - self.transformer.update({"blocks": nn.ModuleList([GPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])}) - self.transformer.update({"ln_f": layernorm_class(config.d_model, device=config.init_device)}) - - for child in self.transformer.children(): - if isinstance(child, torch.nn.ModuleList): - continue - if isinstance(child, torch.nn.Module): - child._fsdp_wrap = True - - # enables scaling output logits; similar to a softmax "temperature" - # PaLM paper uses scale 1/sqrt(config.d_model) - self.logit_scale = None - if config.logit_scale is not None: - logit_scale = config.logit_scale - if isinstance(logit_scale, str): - if logit_scale == "inv_sqrt_d_model": - logit_scale = 1 / math.sqrt(config.d_model) - else: - raise ValueError(f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") - self.logit_scale = logit_scale - - if config.init_device != "meta": - print(f'You are using {config.init_device=}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.') - self.apply(self.param_init_fn) - - self.is_causal = not self.prefix_lm - - # define attn mask - self._attn_bias_initialized = False - self.attn_bias = None - self.attn_bias_shape = module_attn_bias_shape( - self.attn_impl, - config.n_heads, - config.max_seq_len, - self.alibi, - prefix_lm=self.prefix_lm, - causal=self.is_causal, - use_sequence_id=self.attn_uses_sequence_id, - ) - - if config.no_bias: - for module in self.modules(): - if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter): - if config.verbose: - print(f"Removing bias ({module.bias}) from {module}.") - module.register_parameter("bias", None) - - if config.verbose and config.verbose > 2: - print(self) - - @torch.no_grad() - def _attn_bias( - self, - device, - dtype, - attention_mask: Optional[torch.ByteTensor] = None, - prefix_mask: Optional[torch.ByteTensor] = None, - sequence_id: Optional[torch.LongTensor] = None, - ): - if not self._attn_bias_initialized: - if self.attn_bias_shape: - self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype) - self.attn_bias = module_attn_bias( - self.attn_impl, - self.attn_bias, - self.config.n_heads, - self.config.max_seq_len, - causal=self.is_causal, - alibi=self.alibi, - alibi_bias_max=self.alibi_bias_max, - ) - self._attn_bias_initialized = True - - # flash does not support prefix_lm and will incorporate any - # attention_mask inside the attention module - if self.attn_impl == "flash": - return self.attn_bias, attention_mask - - attn_bias = self.attn_bias - - # If using torch or triton, we incorporate the prefix_mask (if appropriate) - if 
self.prefix_lm: - assert isinstance(attn_bias, torch.Tensor) # pyright - assert isinstance(prefix_mask, torch.Tensor) # pyright - attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask) - - # If using torch or triton, we incorporate sequence_id (if appropriate) - if self.attn_uses_sequence_id and sequence_id is not None: - assert isinstance(attn_bias, torch.Tensor) # pyright - attn_bias = self._apply_sequence_id(attn_bias, sequence_id) - - # If using torch or triton, we incorporate attention_mask. This will output - # None in place of attention_mask since it will not be further needed in the - # attention modules. - if attention_mask is not None: - s_k = attention_mask.shape[-1] - if attn_bias is None: - attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype) - else: - attn_bias = attn_bias[:, :, :, -s_k:] - if prefix_mask is not None and (attention_mask.shape != prefix_mask.shape): - raise ValueError(f"attention_mask shape={attention_mask.shape} " + f"and prefix_mask shape={prefix_mask.shape} are not equal.") - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val) - - return attn_bias, None - - def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor): - s_k, s_q = attn_bias.shape[-2:] - if (s_k != self.config.max_seq_len) or (s_q != self.config.max_seq_len): - raise ValueError( - "attn_bias does not match the expected shape. " - + f"The last two dimensions should both be {self.config.max_seq_len} " - + f"but are {s_k} and {s_q}." - ) - seq_len = prefix_mask.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError(f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}") - - # select seq_len subset of attn mask - attn_bias = attn_bias[..., :seq_len, :seq_len] - - # Mix the causal mask and the bidirectional mask to get the full - # allowable attention (i.e. 
full = not accounting for padding yet) - causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len) - prefix = prefix_mask.view(-1, 1, 1, seq_len) - cannot_attend = ~torch.logical_or(causal, prefix.bool()) - - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - - return attn_bias - - def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor): - seq_len = sequence_id.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError(f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}") - - # select seq_len subset of attn mask - attn_bias = attn_bias[..., :seq_len, :seq_len] - - # Restrict attention to tokens that share the same value - # in sequence_id - cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - - return attn_bias - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.ByteTensor] = None, - prefix_mask: Optional[torch.ByteTensor] = None, - sequence_id: Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - return_dict: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - use_cache: Optional[bool] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - ): - return_dict = return_dict if return_dict is not None else self.config.return_dict - use_cache = use_cache if use_cache is not None else self.config.use_cache - - # These args are passed in by keyword in huggingface's generate function - # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206 - # but have not yet been fully implemented in MosaicGPT - if not return_dict: - raise NotImplementedError("return_dict False is not implemented yet for MosaicGPT") - if output_attentions: - raise NotImplementedError("output_attentions is not implemented yet for MosaicGPT") - - if attention_mask is not None: - attention_mask = attention_mask.bool() - - if prefix_mask is not None: - prefix_mask = prefix_mask.bool() - - if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training: - raise NotImplementedError("MosaicGPT does not support training with left padding.") - - if self.prefix_lm and prefix_mask is None: - raise ValueError("prefix_mask is a required argument when MosaicGPT is configured with prefix_lm=True.") - - if self.training: - if self.attn_uses_sequence_id and sequence_id is None: - raise ValueError( - "sequence_id is a required argument when MosaicGPT is configured with attn_uses_sequence_id=True " + "and the model is in train mode." - ) - elif (self.attn_uses_sequence_id is False) and (sequence_id is not None): - warnings.warn( - "MosaicGPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. " - + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True." 
- ) - - S = input_ids.size(1) - - assert S <= self.config.max_seq_len, f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}" - - tok_emb = self.transformer.wte(input_ids) # type: ignore - if self.alibi: - x = tok_emb - else: - past_position = 0 - if past_key_values is not None: - if len(past_key_values) != self.config.n_layers: - raise ValueError( - f"past_key_values must provide a past_key_value for each attention " - + f"layer in the network ({len(past_key_values)=}; {self.config.n_layers=})." - ) - # get the key tensor whose spec should be (batch, seq, dim), and - # collect the `seq`, so that the position embedding is shifted - past_position = past_key_values[0][0].size(1) - - if S + past_position > self.config.max_seq_len: - raise ValueError( - f"Cannot forward input with past sequence length {past_position} and current sequence length " - f"{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}." - ) - pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0) - if attention_mask is not None: - # adjust the position indices to account for padding tokens - pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0) - - pos_emb = self.transformer.wpe(pos) # type: ignore - x = tok_emb + pos_emb - - if self.embedding_fraction == 1: - x = self.transformer.emb_drop(x) # type: ignore - else: - # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414 - x_shrunk = (x * self.embedding_fraction) + (x.detach() * (1 - self.embedding_fraction)) - assert isinstance(self.transformer.emb_drop, nn.Module) # pyright - x = self.transformer.emb_drop(x_shrunk) - - attn_bias, attention_mask = self._attn_bias( - device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id - ) - - # initialize the past key values cache if it should be used - if use_cache and past_key_values is None: - past_key_values = [() for _ in range(self.config.n_layers)] # type: ignore - - all_hidden_states = () if output_hidden_states else None - for b_idx, block in enumerate(self.transformer.blocks): # type: ignore - if output_hidden_states: - assert all_hidden_states is not None # pyright - all_hidden_states = all_hidden_states + (x,) - past_key_value = past_key_values[b_idx] if past_key_values is not None else None - x, past_key_value = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal) - if past_key_values is not None: - past_key_values[b_idx] = past_key_value - - x = self.transformer.ln_f(x) # type: ignore - - # output embedding weight tied to input embedding - # move outputs to same device as weights for token embedding - # needed to support HF `device_map` - assert isinstance(self.transformer.wte, nn.Module) # pyright - assert isinstance(self.transformer.wte.weight, torch.Tensor) # pyright - logits = F.linear(x.to(self.transformer.wte.weight.device), self.transformer.wte.weight, None) - - if self.logit_scale is not None: - if self.logit_scale == 0: - warnings.warn(f"Multiplying logits by {self.logit_scale=}. 
This will produce uniform (uninformative) outputs.") - logits *= self.logit_scale - - loss = None - if labels is not None: - _labels = torch.roll(labels, shifts=-1) - _labels[:, -1] = -100 - loss = F.cross_entropy( - logits.view(-1, logits.size(-1)), - _labels.to(logits.device).view(-1), - ) - - return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=past_key_values, hidden_states=all_hidden_states) - - # Param Initialization, needed for device='meta' fast initialization - def param_init_fn(self, module): - init_fn_name = self.config.param_init_fn - if self.config.verbose > 1: - warnings.warn(f"Using {init_fn_name} initialization.") - MODEL_INIT_REGISTRY[init_fn_name](module=module, **self.config.to_dict()) - - # FSDP Wrap function - def fsdp_wrap_fn(self, module): - return isinstance(module, GPTBlock) - - # Activation Checkpointing - def activation_checkpointing_fn(self, module): - return isinstance(module, GPTBlock) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - if inputs_embeds is not None: - raise NotImplementedError("inputs_embeds is not implemented for MosaicGPT yet") - - attention_mask = kwargs["attention_mask"].bool() - if attention_mask[:, -1].sum() != attention_mask.shape[0]: - raise NotImplementedError("MosaicGPT does not support generation with right padding.") - - if self.attn_uses_sequence_id and self.training: - sequence_id = torch.zeros_like(input_ids[:1]) - else: - sequence_id = None - - if past_key_values is not None: - input_ids = input_ids[:, -1].unsqueeze(-1) - - if self.prefix_lm: - # Leverage a convenience of sequential generation! - prefix_mask = torch.ones_like(attention_mask) - # This requires that we're using the cache - if kwargs.get("use_cache") == False: - raise NotImplementedError("MosaicGPT with prefix_lm=True does not support use_cache=False.") - else: - prefix_mask = None - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "prefix_mask": prefix_mask, - "sequence_id": sequence_id, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache", True), - } - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Used by HuggingFace generate when using beam search with kv-caching. - - See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133 - for an example in transformers. 
- """ - reordered_past = [] - for layer_past in past_key_values: - reordered_past += [tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)] - return reordered_past - - def get_input_embeddings(self): - return self.transformer.wte - - def set_input_embeddings(self, new_embeddings): - self.transformer.wte = new_embeddings.device(self.transformer.wte.weight.device) - - def get_decoder(self): - return self.transformer diff --git a/mllm/flamingo/mpt_redpajama/param_init_fns.py b/mllm/flamingo/mpt_redpajama/param_init_fns.py deleted file mode 100644 index f897b2c2fb2628d7b53c9176b7018bf93f171394..0000000000000000000000000000000000000000 --- a/mllm/flamingo/mpt_redpajama/param_init_fns.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright 2022 MosaicML Examples authors -# SPDX-License-Identifier: Apache-2.0 -import math -import warnings -from collections.abc import Sequence -from functools import partial -from typing import Optional, Tuple, Union - -import torch -from torch import nn - - -def torch_default_param_init_fn_( - module: nn.Module, - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - if verbose > 1: - warnings.warn(f"Initializing network using module's reset_parameters attribute") - - if hasattr(module, "reset_parameters"): - module.reset_parameters() # type: ignore - - -def fused_init_helper_(module: nn.Module, init_fn_): - # parameter initialization is often based on the parameters shape. - # If a layer is fused, initialization should be based on the shapes - # of the original tensor instead of the shape of the fused tensor. - # Layers which are fused should have the _fused attibute defined. - # The first element of _fused is the dimension along which the tensor is fused. - # This is followed by an iterable of split indices." 
- - _fused = getattr(module, "_fused", None) - - if _fused is None: - raise RuntimeError(f"Internal logic error") - - dim, splits = _fused - splits = (0, *splits, module.weight.size(dim)) # type: ignore - for s, e in zip(splits[:-1], splits[1:]): - slice_indices = [slice(None)] * module.weight.ndim # type: ignore - slice_indices[dim] = slice(s, e) - init_fn_(module.weight[slice_indices]) # type: ignore - - -def generic_param_init_fn_( - module: nn.Module, - init_fn_, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - if verbose > 1: - warnings.warn(f"If model has bias parameters they are initialized to 0.") - - # enable user to divide _is_residual weights by - # a value which defaults to math.sqrt(2 * cfg.n_layers) - init_div_is_residual = init_div_is_residual - - if init_div_is_residual is False: - # not used, for pyright - div_is_residual = 1.0 - elif init_div_is_residual is True: - div_is_residual = math.sqrt(2 * n_layers) - elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int): - div_is_residual = init_div_is_residual - elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric(): - # do not trust YAML parsing to always convert numbers to numbers - div_is_residual = float(init_div_is_residual) - else: - # not used, for pyright - div_is_residual = 1.0 - raise ValueError(f"Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}") - - if init_div_is_residual is not False: - if verbose > 1: - warnings.warn( - f"Initializing _is_residual layers then dividing them by {div_is_residual}." - + f"set `init_div_is_residual: false` in model config to disable this." - ) - - if isinstance(module, nn.Linear): - # Linear - if hasattr(module, "_fused"): - fused_init_helper_(module, init_fn_) - else: - init_fn_(module.weight) - if module.bias is not None: - torch.nn.init.zeros_(module.bias) - - if init_div_is_residual is not False and getattr(module, "_is_residual", False): - with torch.no_grad(): - module.weight.div_(div_is_residual) - - elif isinstance(module, nn.Embedding): - # Embedding - if emb_init_std is not None: - std = emb_init_std - if std == 0: - warnings.warn(f"Embedding layer initialized to 0.") - emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std) - if verbose > 1: - warnings.warn(f"Embedding layer initialized using normal distribution with mean=0 and {std=}.") - elif emb_init_uniform_lim is not None: - lim = emb_init_uniform_lim - if isinstance(lim, Sequence): - if len(lim) > 2: - raise ValueError(f"Uniform init requires a min and a max limit. User input: {lim}.") - if lim[0] == lim[1]: - warnings.warn(f"Embedding layer initialized to {lim[0]}.") - else: - if lim == 0: - warnings.warn(f"Embedding layer initialized to 0.") - lim = [-lim, lim] - a, b = lim - emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b) - if verbose > 1: - warnings.warn(f"Embedding layer initialized using uniform distribution in range {lim}.") - else: - emb_init_fn_ = init_fn_ - - emb_init_fn_(module.weight) - - elif isinstance(module, nn.LayerNorm): - # LayerNorm - if verbose > 1: - warnings.warn(f"LayerNorm gamma weights are set to 1. 
If the layer has a bias it is initialized to 0.") - torch.nn.init.ones_(module.weight) - if module.bias is not None: - torch.nn.init.zeros_(module.bias) - - elif isinstance(module, nn.MultiheadAttention): - # torch's MultiheadAttention - if module._qkv_same_embed_dim: - assert module.in_proj_weight is not None - assert module.q_proj_weight is None and module.k_proj_weight is None and module.v_proj_weight is None - assert d_model is not None - # in_proj_weight is actually 3 layers and should be split up for width based init - _d = d_model - splits = (0, _d, 2 * _d, 3 * _d) - for s, e in zip(splits[:-1], splits[1:]): - init_fn_(module.in_proj_weight[s:e]) - else: - assert module.q_proj_weight is not None and module.k_proj_weight is not None and module.v_proj_weight is not None - assert module.in_proj_weight is None - init_fn_(module.q_proj_weight) - init_fn_(module.k_proj_weight) - init_fn_(module.v_proj_weight) - - # bias - if module.in_proj_bias is not None: - torch.nn.init.zeros_(module.in_proj_bias) - if module.bias_k is not None: - torch.nn.init.zeros_(module.bias_k) - if module.bias_v is not None: - torch.nn.init.zeros_(module.bias_v) - - # out proj - init_fn_(module.out_proj.weight) - if init_div_is_residual is not False and getattr(module.out_proj, "_is_residual", False): - with torch.no_grad(): - module.out_proj.weight.div_(div_is_residual) - if module.out_proj.bias is not None: - torch.nn.init.zeros_(module.out_proj.bias) - - else: - for _ in module.parameters(recurse=False): - # raise error if uninitialized module has any parameters - raise NotImplementedError(f"{module.__class__.__name__} parameters are not initialized by param_init_fn.") - - -def _normal_init_(std, mean=0.0): - return partial(torch.nn.init.normal_, mean=mean, std=std) - - -def _normal_param_init_fn_( - module: nn.Module, - std: float, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - init_fn_ = _normal_init_(std=std) - - if verbose > 1: - warnings.warn(f"Using torch.nn.init.normal_ init fn mean=0.0, std={std}") - - generic_param_init_fn_( - module=module, - init_fn_=init_fn_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def baseline_param_init_fn_( - module: nn.Module, - init_std: float, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - if init_std is None: - raise ValueError("You must set model.init_std to a float value to use the default initialization scheme.") - _normal_param_init_fn_( - module=module, - std=init_std, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def small_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: int, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: 
Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - # very close to kaiming normal - # from Transformers without Tears (2019) - Nguyen & Salazar - std = math.sqrt(2 / (5 * d_model)) - _normal_param_init_fn_( - module=module, - std=std, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def neox_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: int, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - verbose: int = 0, - **kwargs, -): - """From section 2.3.1 of GPT-NeoX-20B: - - An Open-Source AutoregressiveLanguage Model — Black et. al. (2022) - see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151 - and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py - """ - del kwargs # unused, just to capture any extra args from the config - residual_div = n_layers / math.sqrt(10) # small std / wang std - - if verbose > 1: - warnings.warn(f"setting init_div_is_residual to {residual_div}") - - small_param_init_fn_( - module=module, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=residual_div, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def kaiming_uniform_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - init_gain: float = 0, - fan_mode: str = "fan_in", - init_nonlinearity: str = "leaky_relu", - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - - if verbose > 1: - warnings.warn(f"Using nn.init.kaiming_uniform_ init fn with parameters: " + f"a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}") - - kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity) - - generic_param_init_fn_( - module=module, - init_fn_=kaiming_uniform_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def kaiming_normal_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - init_gain: float = 0, - fan_mode: str = "fan_in", - init_nonlinearity: str = "leaky_relu", - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - - if verbose > 1: - warnings.warn(f"Using nn.init.kaiming_normal_ init fn with parameters: " + f"a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}") - - kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity) - - generic_param_init_fn_( - module=module, - init_fn_=kaiming_normal_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - 
emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def xavier_uniform_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - init_gain: float = 0, - verbose: int = 0, - **kwargs, -): - del kwargs # unused, just to capture any extra args from the config - xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain) - - if verbose > 1: - warnings.warn(f"Using torch.nn.init.xavier_uniform_ init fn with parameters: " + f"gain={init_gain}") - - generic_param_init_fn_( - module=module, - init_fn_=xavier_uniform_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -def xavier_normal_param_init_fn_( - module: nn.Module, - n_layers: int, - d_model: Optional[int] = None, - init_div_is_residual: Union[int, float, str, bool] = True, - emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, - init_gain: float = 0, - verbose: int = 0, - **kwargs, -): - xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain) - - if verbose > 1: - warnings.warn(f"Using torch.nn.init.xavier_normal_ init fn with parameters: " + f"gain={init_gain}") - - generic_param_init_fn_( - module=module, - init_fn_=xavier_normal_, - d_model=d_model, - n_layers=n_layers, - init_div_is_residual=init_div_is_residual, - emb_init_std=emb_init_std, - emb_init_uniform_lim=emb_init_uniform_lim, - verbose=verbose, - ) - - -MODEL_INIT_REGISTRY = { - "default_": torch_default_param_init_fn_, - "baseline_": baseline_param_init_fn_, - "kaiming_uniform_": kaiming_uniform_param_init_fn_, - "kaiming_normal_": kaiming_normal_param_init_fn_, - "neox_init_": neox_param_init_fn_, - "small_init_": small_param_init_fn_, - "xavier_uniform_": xavier_uniform_param_init_fn_, - "xavier_normal_": xavier_normal_param_init_fn_, -} diff --git a/mllm/flamingo/utils.py b/mllm/flamingo/utils.py deleted file mode 100644 index 2bfc7ee365a7cf77699ed2aae7bec38c65d9404d..0000000000000000000000000000000000000000 --- a/mllm/flamingo/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import re -import torch - - -def rename_flamingo_checkpoint(old_ckpt: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Rename some keys in the public flamingo checkpoint""" - perceiver_pattern1 = re.compile(r"perceiver\.layers\.[0-9]\.0") - perceiver_pattern2 = re.compile(r"perceiver\.layers\.[0-9]\.1") - new_ckpt = old_ckpt.copy() - for key, value in old_ckpt.items(): - if re.match(perceiver_pattern1, key): - new_key = re.sub(r"([0-9])\.0", r"\1", key) - new_ckpt.pop(key) - new_ckpt[new_key] = value - elif re.match(perceiver_pattern2, key): - new_key = re.sub(r"([0-9])\.1", r"\1.feed_forward", key) - new_ckpt.pop(key) - new_ckpt[new_key] = value - elif key.startswith("lang_encoder.gated_cross_attn_layers."): - new_ckpt.pop(key) - elif key.startswith("lang_encoder.") and "ff_gate" not in key: - new_key = key.replace("ff", "feed_forward") - new_ckpt.pop(key) - new_ckpt[new_key] = value - return new_ckpt diff --git a/mllm/otter/Otter-MPT7B-config.json b/mllm/otter/Otter-MPT7B-config.json deleted file mode 100644 index 5a34ede78faba5a2f8aeb3991a901898e7a2482d..0000000000000000000000000000000000000000 --- a/mllm/otter/Otter-MPT7B-config.json +++ /dev/null @@ 
-1,197 +0,0 @@ -{ - "_commit_hash": null, - "_name_or_path": "/mnt/petrelfs/zhangyuanhan/weights/flamingo-mpt-7B", - "architectures": [ - "FlamingoForConditionalGeneration" - ], - "cross_attn_every_n_layers": 4, - "model_type": "otter", - "only_attend_previous": true, - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": [ - "MPTForCausalLM" - ], - "attn_config": { - "alibi": true, - "alibi_bias_max": 8, - "attn_impl": "torch", - "attn_pdrop": 0, - "attn_type": "multihead_attention", - "attn_uses_sequence_id": false, - "clip_qkv": null, - "prefix_lm": false, - "qk_ln": false, - "softmax_scale": null - }, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "d_model": 4096, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "emb_pdrop": 0, - "embedding_fraction": 1.0, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "expansion_ratio": 4, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "hidden_size": 4096, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "init_config": { - "emb_init_std": null, - "emb_init_uniform_lim": null, - "fan_mode": "fan_in", - "init_div_is_residual": true, - "init_gain": 0, - "init_nonlinearity": "relu", - "init_std": 0.02, - "name": "kaiming_normal_", - "verbose": 0 - }, - "init_device": "cpu", - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "learned_pos_emb": true, - "length_penalty": 1.0, - "logit_scale": null, - "max_length": 20, - "max_seq_len": 2048, - "min_length": 0, - "model_type": "mpt", - "n_heads": 32, - "n_layers": 32, - "no_bias": true, - "no_repeat_ngram_size": 0, - "norm_type": "low_precision_layernorm", - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "resid_pdrop": 0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "tokenizer_name": "EleutherAI/gpt-neox-20b", - "top_k": 50, - "top_p": 1.0, - "torch_dtype": "bfloat16", - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false, - "use_cache": false, - "verbose": 0, - "vocab_size": 50432 - }, - "torch_dtype": "float32", - "transformers_version": null, - "use_media_placement_augmentation": true, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - 
"hidden_act": "quick_gelu", - "hidden_size": 1024, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1.0, - "initializer_range": 0.02, - "intermediate_size": 4096, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 16, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 24, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.1", - "typical_p": 1.0, - "use_bfloat16": false - } -} \ No newline at end of file diff --git a/mllm/otter/__init__.py b/mllm/otter/__init__.py deleted file mode 100644 index b6e68928148e17b1c0f98883e95cadfcba493b10..0000000000000000000000000000000000000000 --- a/mllm/otter/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import TYPE_CHECKING - -from transformers.utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) - - -_import_structure = { - "configuration_otter": [ - "OtterConfig", - ], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_otter"] = [ - "OtterModel", - "OtterPreTrainedModel", - "OtterForConditionalGeneration", - ] - -if TYPE_CHECKING: - from .configuration_otter import OtterConfig - - # from .processing_otter import OtterProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_otter import ( - OtterForConditionalGeneration, - OtterModel, - OtterPreTrainedModel, - ) - -else: - import sys - - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/mllm/otter/config.json b/mllm/otter/config.json deleted file mode 100644 index 9e2288719c3fce260a10cafdb29faf195a349806..0000000000000000000000000000000000000000 --- a/mllm/otter/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "model_type": "otter", - "cross_attn_every_n_layers": 4, - "tie_word_embeddings": false, - "use_media_placement_augmentation": true, - "only_attend_previous": true, - "text_config": { - "_name_or_path": "luodian/llama-7b-hf", - "model_type": "llama" - }, - "vision_config": { - "_name_or_path": "openai/clip-vit-large-patch14", - "model_type": "clip_vision_model", - "hidden_size": 1024, - "intermediate_size": 4096, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "image_size": 224, - "patch_size": 14 - } -} \ No newline at end of file diff --git a/mllm/otter/configuration_otter.py b/mllm/otter/configuration_otter.py deleted file mode 100644 index 
ef28057d6b323717f66e095022c59cd4a119d88d..0000000000000000000000000000000000000000 --- a/mllm/otter/configuration_otter.py +++ /dev/null @@ -1,97 +0,0 @@ -import copy - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging -from transformers.models.auto import CONFIG_MAPPING -from transformers.models.clip import CLIPVisionConfig - -from mllm.flamingo.falcon.configuration_RW import RWConfig -from mllm.flamingo.mpt.configuration_mpt import MPTConfig -from mllm.flamingo.mpt_redpajama.configuration_mosaic_gpt import MosaicGPTConfig - -logger = logging.get_logger(__name__) - - -class OtterConfig(PretrainedConfig): - r""" - [`OtterConfig`] is the configuration class to store the configuration of a [`OtterForConditionalGeneration`]. It is - used to instantiate a Otter model according to the specified arguments, defining the vision model and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to - that of the Otter architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`PretrainedConfig`]. - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize any [`PretrainedConfig`]. - cross_attn_every_n_layers (`int`, *optional*, defaults to 4): - The number of cross-attention layers adding after each transformer layer. - - kwargs (*optional*): - Dictionary of keyword arguments. - - Example: - - ```python - >>> from transformers import ( - ... PretrainedConfig, - ... OPTConfig, - ... OtterConfig, - ... OtterForConditionalGeneration, - ... ) - - >>> # Initializing a OtterConfig with luodian/otter-9b-hf style configuration - >>> configuration = OtterConfig() - - >>> # Initializing a OtterForConditionalGeneration (with random weights) from the Salesforce/Otter-opt-2.7b style configuration - >>> model = OtterForConditionalGeneration(configuration) - ```""" - model_type = "otter" - is_composition = True - - def __init__(self, vision_config=None, text_config=None, cross_attn_every_n_layers: int = 4, use_media_placement_augmentation: bool = True, **kwargs): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the vision config with default values.") - - if text_config is None: - text_config = {} - logger.info("text_config is None. 
Initializing the text config with default values.") - - self.vision_config = CLIPVisionConfig(**vision_config) - if "architectures" in text_config.keys() and text_config["architectures"] != None: - if text_config["architectures"][0] == "MPTForCausalLM": - self.text_config = MPTConfig(**text_config) - elif text_config["architectures"][0] == "MosaicGPT": - self.text_config = MosaicGPTConfig(**text_config) - elif text_config["architectures"][0] == "RWForCausalLM": - self.text_config = RWConfig(**text_config) - elif text_config["architectures"][0] == "LlamaForCausalLM": - self.text_config = CONFIG_MAPPING[text_config.pop("model_type")](**text_config) - else: - import pdb - - pdb.set_trace() - else: - self.text_config = CONFIG_MAPPING[text_config.pop("model_type")](**text_config) - self.cross_attn_every_n_layers = cross_attn_every_n_layers - self.use_media_placement_augmentation = use_media_placement_augmentation - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. - - Returns: - `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = copy.deepcopy(self.__dict__) - output["vision_config"] = self.vision_config.to_dict() - output["text_config"] = self.text_config.to_dict() - output["model_type"] = self.__class__.model_type - output["cross_attn_every_n_layers"] = self.cross_attn_every_n_layers - output["use_media_placement_augmentation"] = self.use_media_placement_augmentation - return output diff --git a/mllm/otter/converting_flamingo_to_otter.py b/mllm/otter/converting_flamingo_to_otter.py deleted file mode 100644 index 114688ae1c219e4a51858b0adc9c643e96b3f177..0000000000000000000000000000000000000000 --- a/mllm/otter/converting_flamingo_to_otter.py +++ /dev/null @@ -1,26 +0,0 @@ -# This script is used to convert the huggingface format Open-Flamingo model to the Otter model. 
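
Stepping back to the `OtterConfig` defined above: the text backbone class is chosen from `text_config["architectures"][0]`, with a fallback to Hugging Face's `CONFIG_MAPPING` keyed by `model_type`. A hypothetical instantiation of that fallback path (field values are illustrative, and the import assumes this now-deleted module's original location):

```python
from mllm.otter.configuration_otter import OtterConfig

config = OtterConfig(
    vision_config={"hidden_size": 1024, "image_size": 224, "patch_size": 14},
    text_config={"model_type": "llama"},  # no "architectures" key -> CONFIG_MAPPING fallback
    cross_attn_every_n_layers=4,
)
print(type(config.text_config).__name__)  # LlamaConfig
```
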
-# You can use it in parent folder by running: python -m models.otter.converting_flamingo_to_otter --checkpoint_path --save_path -import argparse -import torch -from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration -from otter_ai.models.flamingo.modeling_flamingo import FlamingoForConditionalGeneration - -# Define argument parser -parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") - -# Add arguments -parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the pre-trained Open-Flamingo model checkpoint.") -parser.add_argument("--save_path", type=str, default=None, help="Path to the converted Otter model checkpoint.") - -# Parse the input arguments -args = parser.parse_args() - -# Load the model -model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") -model.text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "", ""]}) -if model.lang_encoder.__class__.__name__ == "LlamaForCausalLM": - model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) - -# Save the model -checkpoint_path = args.save_path -OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) diff --git a/mllm/otter/converting_otter_fp32_to_fp16.py b/mllm/otter/converting_otter_fp32_to_fp16.py deleted file mode 100644 index 373b8b53b11b946f54acea5487d46b734d688fbc..0000000000000000000000000000000000000000 --- a/mllm/otter/converting_otter_fp32_to_fp16.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse -import torch -from otter_ai.models.otter.modeling_otter import OtterForConditionalGeneration - -# Define argument parser -parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") - -# Add arguments -parser.add_argument( - "--load_bit", - type=str, - choices=["fp16", "bf16"], - default="fp16", - help="Precision of the loaded model. Either 'fp16' or 'bf16'. Default is 'fp16'.", -) -parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the pre-trained model checkpoint.") -parser.add_argument("--save_path", type=str, default=None, help="Path to the converted model checkpoint.") - -# Parse the input arguments -args = parser.parse_args() - -# Set precision based on load_bit argument -if args.load_bit == "fp16": - precision = {"torch_dtype": torch.float16} -elif args.load_bit == "bf16": - precision = {"torch_dtype": torch.bfloat16} - -# Load the model -model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision) - -# Save the model -if args.save_path is None: - checkpoint_path = args.checkpoint_path + f"-{args.load_bit}" -else: - checkpoint_path = args.save_path -OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) diff --git a/mllm/otter/converting_otter_pt_to_hf.py b/mllm/otter/converting_otter_pt_to_hf.py deleted file mode 100644 index eb0acd5b16780e21389f131aa36d0bbcbd5ed523..0000000000000000000000000000000000000000 --- a/mllm/otter/converting_otter_pt_to_hf.py +++ /dev/null @@ -1,63 +0,0 @@ -"""convert from otter pt to otter hf. Will remove after we use otter hf model to train. -""" - -import argparse -import os - -import torch - -from modeling_otter import OtterForConditionalGeneration - - -# The function is to inject newly trained otter perceiver parameters into the pretrained otter init model. 
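
The "injection" described above is a non-strict state-dict merge: keys present in the newly trained checkpoint overwrite the pretrained weights, and every key absent from it keeps its pretrained value. A toy sketch of the pattern (module names are made up for illustration):

```python
import torch
import torch.nn as nn

model = nn.ModuleDict({"perceiver": nn.Linear(4, 4), "lang_encoder": nn.Linear(4, 4)})
trained_subset = {"perceiver.weight": torch.zeros(4, 4), "perceiver.bias": torch.zeros(4)}

# strict=False lets the checkpoint cover only part of the model; the return
# value reports which model keys were left untouched
missing, unexpected = model.load_state_dict(trained_subset, strict=False)
print(missing)     # ['lang_encoder.weight', 'lang_encoder.bias']
print(unexpected)  # []
```
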
-@torch.no_grad() -def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None: - old_ckpt = torch.load(old_ckpt_path, map_location="cpu") - if old_ckpt.get("model_state_dict", None) is not None: - old_ckpt = old_ckpt["model_state_dict"] - new_ckpt = old_ckpt - # folder_path = os.path.dirname(old_ckpt_path) - # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "otter/config.json" - model = OtterForConditionalGeneration.from_pretrained( - args.pretrained_model_path, - device_map="auto", - ) - - if "flamingo" in args.pretrained_model_path: - model.text_tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - if "LlamaForCausalLM" in model.lang_encoder.__class__.__name__: - model.lang_encoder.resize_token_embeddings(len(model.text_tokenizer)) - - _ = model.load_state_dict(new_ckpt, strict=False) - print(f"Saving HF model to {new_folder_path}") - model.save_pretrained(new_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--old_ckpt_path", - "-old", - type=str, - required=True, - help="Path to the pt checkpoint", - ) - parser.add_argument( - "--new_hf_path", - "-new", - type=str, - required=True, - help="Path to the hf folder", - ) - parser.add_argument( - "--pretrained_model_path", - "-pretrained", - type=str, - default="luodian/OTTER-MPT7B-Init", - required=True, - help="Path to the pretrained model folder.", - ) - args = parser.parse_args() - if not os.path.exists(os.path.dirname(args.new_hf_path)): - os.makedirs(os.path.dirname(args.new_hf_path)) - dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path) diff --git a/mllm/otter/converting_otter_to_lora.py b/mllm/otter/converting_otter_to_lora.py deleted file mode 100755 index 972807e098a3b3bb2e217cf3e31888a654d27fa3..0000000000000000000000000000000000000000 --- a/mllm/otter/converting_otter_to_lora.py +++ /dev/null @@ -1,61 +0,0 @@ -import argparse -import torch -import sys - -from .modeling_otter import OtterForConditionalGeneration -from peft import get_peft_model, LoraConfig, TaskType - -MODEL_CLASSES = { - "LlamaForCausalLM": "llama", - "OPTForCausalLM": "opt", - "GPTJForCausalLM": "gptj", - "GPTNeoXForCausalLM": "gpt_neox", - "MPTForCausalLM": "mpt", -} - -# Define argument parser -parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.") - -# Add arguments -parser.add_argument( - "--checkpoint_path", - type=str, - help="Path to the pre-trained model checkpoint.", - default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705", -) -parser.add_argument( - "--save_path", - type=str, - default="/data/bli/checkpoints/OTTER-MPT7B-Instruct0705-LoRA", - help="Path to the converted model checkpoint.", -) - -# Parse the input arguments -args = parser.parse_args() - -# Load the model -model = OtterForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto") - -# adding lora -standard_modules = ["q_proj", "v_proj"] -lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]] -model_to_lora_modules = { - "llama": standard_modules, - "opt": standard_modules, - "gptj": standard_modules, - "gpt_neox": ["query_key_value"], - "mpt": ["Wqkv"], -} -lora_config = LoraConfig( - r=16, - lora_alpha=32, - lora_dropout=0.05, - task_type=TaskType.CAUSAL_LM, - target_modules=model_to_lora_modules[lang_encoder_short_name], -) -model.config.update({"lora_config": {"r": 
16, "lora_alpha": 32, "lora_dropout": 0.05}}) -model.lang_encoder = get_peft_model(model.lang_encoder, lora_config) - -# Save the model -checkpoint_path = args.save_path -OtterForConditionalGeneration.save_pretrained(model, checkpoint_path) diff --git a/mllm/otter/flamingo_pt2otter_hf.py b/mllm/otter/flamingo_pt2otter_hf.py deleted file mode 100644 index c1e7e68303b6274f6d23602e765fb435d211b3ef..0000000000000000000000000000000000000000 --- a/mllm/otter/flamingo_pt2otter_hf.py +++ /dev/null @@ -1,139 +0,0 @@ -""" convert from open flamingo pt to otter hf, as the starting point for ICI training - Deprecated and should change for MPT version OtterModel -""" - -import re -import argparse -import os - -import torch -import torch.nn as nn -from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer - -from otter_ai.models.otter.modeling_otter import ( - OtterPreTrainedModel, - OtterLMMixin, - extend_instance, - _infer_decoder_layers_attr_name, - OtterPerceiverResampler, -) - -from otter_ai.models.otter.configuration_otter import OtterConfig - - -class OtterModel(OtterPreTrainedModel): - # We need to download the llaMA and CLIP here, and the model does not have the when init - config_class = OtterConfig - - def __init__( - self, - config: OtterConfig, - ): - super().__init__(config) - text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path) - lang_encoder = LlamaForCausalLM.from_pretrained(config.text_config._name_or_path) - vision_encoder = CLIPVisionModel.from_pretrained(config.vision_config._name_or_path) - - text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", ""]}) - if text_tokenizer.pad_token is None: - text_tokenizer.add_special_tokens({"pad_token": ""}) - self.text_tokenizer = text_tokenizer - self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1] - self.media_token_id = text_tokenizer.encode("")[-1] - - extend_instance(lang_encoder, OtterLMMixin) - decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder) - lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name) - lang_encoder.resize_token_embeddings(len(text_tokenizer)) - self.lang_encoder = lang_encoder - - self.cross_attn_every_n_layers = config.cross_attn_every_n_layers - self.use_media_placement_augmentation = config.use_media_placement_augmentation - self.only_attend_previous = config.only_attend_previous - - vision_encoder.output_tokens = True - self.vision_encoder = vision_encoder - - self.vis_dim = 1024 - self.perceiver = OtterPerceiverResampler(dim=self.vis_dim) - - self.lang_encoder.init_otter( - media_token_id=self.media_token_id, - vis_hidden_size=self.vis_dim, - cross_attn_every_n_layers=self.cross_attn_every_n_layers, - use_media_placement_augmentation=self.use_media_placement_augmentation, - only_attend_previous=self.only_attend_previous, - ) - - def get_input_embeddings(self) -> nn.Module: - return self.lang_encoder.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.lang_encoder.set_input_embeddings(new_embeddings) - - def get_output_embeddings(self) -> nn.Module: - return self.lang_encoder.get_output_embeddings() - - def set_output_embeddings(self, new_embeddings): - self.lang_encoder.set_output_embeddings(new_embeddings) - - -def rename_flamingo_checkpoint(old_ckpt: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Rename some keys in the public flamingo checkpoint""" - perceiver_pattern1 = re.compile(r"perceiver\.layers\.[0-9]\.0") - perceiver_pattern2 = 
re.compile(r"perceiver\.layers\.[0-9]\.1") - new_ckpt = old_ckpt.copy() - for key, value in old_ckpt.items(): - if re.match(perceiver_pattern1, key): - new_key = re.sub(r"([0-9])\.0", r"\1", key) - new_ckpt.pop(key) - new_ckpt[new_key] = value - elif re.match(perceiver_pattern2, key): - new_key = re.sub(r"([0-9])\.1", r"\1.feed_forward", key) - new_ckpt.pop(key) - new_ckpt[new_key] = value - elif key.startswith("lang_encoder.gated_cross_attn_layers."): - new_ckpt.pop(key) - elif key.startswith("lang_encoder.") and "ff_gate" not in key: - new_key = key.replace("ff", "feed_forward") - new_ckpt.pop(key) - new_ckpt[new_key] = value - - return new_ckpt - - -@torch.no_grad() -def dump_hf_model(old_ckpt_path: str, new_folder_path: str) -> None: - os.makedirs(new_folder_path, exist_ok=True) - old_ckpt = torch.load(old_ckpt_path, map_location="cpu") - if old_ckpt.get("model", None) is not None: - old_ckpt = old_ckpt["model"] - config = OtterConfig.from_json_file("otter/config.json") - model = OtterModel(config) - new_ckpt = rename_flamingo_checkpoint(old_ckpt) - model.load_state_dict(new_ckpt, strict=False) - text_tokenizer = model.text_tokenizer - text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "", ""]}) - model.lang_encoder.resize_token_embeddings(len(text_tokenizer)) - print(f"Saving HF model to {new_folder_path}") - model.save_pretrained(new_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--old_ckpt_path", - "-old", - type=str, - required=True, - help="Path to the Open Flamingo checkpoint", - ) - parser.add_argument( - "--new_hf_path", - "-new", - type=str, - required=True, - help="Path to the HF folder", - ) - args = parser.parse_args() - dump_hf_model(args.old_ckpt_path, args.new_hf_path) diff --git a/mllm/otter/modeling_otter.py b/mllm/otter/modeling_otter.py deleted file mode 100755 index d0e450fe441fe425b577cde7ca3e80c48ca8cffc..0000000000000000000000000000000000000000 --- a/mllm/otter/modeling_otter.py +++ /dev/null @@ -1,1025 +0,0 @@ -from typing import Optional, List - -import torch -import torch.nn as nn -from transformers.modeling_utils import PreTrainedModel -from transformers.modeling_outputs import CausalLMOutputWithPast -from einops import rearrange, repeat -from accelerate.hooks import add_hook_to_module, AlignDevicesHook - -from .configuration_otter import OtterConfig - -from mllm.flamingo.falcon.modelling_RW import RWForCausalLM -from mllm.flamingo.mpt.modeling_mpt import MPTForCausalLM -from mllm.flamingo.mpt_redpajama.mosaic_gpt import MosaicGPT - -from transformers.models.auto import AutoModel, AutoModelForCausalLM, AutoTokenizer -from peft import get_peft_model, LoraConfig, TaskType - -import sys -import random - -# The package importlib_metadata is in a different place, depending on the python version. 
diff --git a/mllm/otter/modeling_otter.py b/mllm/otter/modeling_otter.py
deleted file mode 100755
index d0e450fe441fe425b577cde7ca3e80c48ca8cffc..0000000000000000000000000000000000000000
--- a/mllm/otter/modeling_otter.py
+++ /dev/null
@@ -1,1025 +0,0 @@
-from typing import Optional, List
-
-import torch
-import torch.nn as nn
-from transformers.modeling_utils import PreTrainedModel
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from einops import rearrange, repeat
-from accelerate.hooks import add_hook_to_module, AlignDevicesHook
-
-from .configuration_otter import OtterConfig
-
-from mllm.flamingo.falcon.modelling_RW import RWForCausalLM
-from mllm.flamingo.mpt.modeling_mpt import MPTForCausalLM
-from mllm.flamingo.mpt_redpajama.mosaic_gpt import MosaicGPT
-
-from transformers.models.auto import AutoModel, AutoModelForCausalLM, AutoTokenizer
-from peft import get_peft_model, LoraConfig, TaskType
-
-import sys
-import random
-
-# The package importlib_metadata lives in a different place depending on the Python version.
-if sys.version_info < (3, 8):
-    import importlib_metadata
-else:
-    import importlib.metadata as importlib_metadata
-
-import torch.distributed as dist
-
-# Add this line at the beginning of your script or in your main function
-# dist.init_process_group(backend='nccl')
-
-XFORMERS_AVAIL = False
-XFORMERS_MSG_PRINTED = False  # Add this global variable
-try:
-    if not XFORMERS_MSG_PRINTED:  # Check if the message has been printed before
-        import xformers.ops as xops
-        from xformers_model import CLIPVisionModel, LlamaForCausalLM
-        from transformers import LlamaTokenizer
-
-        _xformers_version = importlib_metadata.version("xformers")
-        XFORMERS_AVAIL = True  # mark xformers as usable; without this the fast path below is never taken
-        if dist.is_initialized() and dist.get_rank() == 0:  # Check if the current process rank is 0
-            print(f"Successfully imported xformers version {_xformers_version}")
-except ImportError as e:
-    if not XFORMERS_MSG_PRINTED:  # Check if the message has been printed before
-        from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer
-
-        if dist.is_initialized() and dist.get_rank() == 0:  # Check if the current process rank is 0
-            print(f"Failed to import xformers: {e}")
-            XFORMERS_AVAIL = False
-            print("No xformers found. We recommend installing xformers via `pip install xformers` or `conda install -c xformers xformers`")
-            XFORMERS_MSG_PRINTED = True  # Set the variable to True after printing the message
-
-# from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer
-
-__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
-    "opt": "model.decoder.layers",
-    "gptneo": "transformer.h",
-    "gptj": "transformer.h",
-    "gpt-j": "transformer.h",
-    "pythia": "gpt_neox.layers",
-    "llama": "model.layers",
-    "RWForCausalLM": "transformer.h",
-    "MPTForCausalLM": "transformer.blocks",
-    "MosaicGPT": "transformer.blocks",
-}
-
-MODEL_CLASSES = {
-    "LlamaForCausalLM": "llama",
-    "OPTForCausalLM": "opt",
-    "GPTJForCausalLM": "gptj",
-    "GPTNeoXForCausalLM": "gpt_neox",
-    "MPTForCausalLM": "mpt",
-    "MosaicGPT": "mpt",
-}
-
-
-def _infer_decoder_layers_attr_name(model: nn.Module):
-    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
-        if k.lower() in model.__class__.__name__.lower():
-            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
-
-    raise ValueError(
-        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
-    )
-
-
-def extend_instance(obj, mixin):
-    """Apply mixins to a class instance after creation"""
-    base_cls = obj.__class__
-    base_cls_name = obj.__class__.__name__
-    obj.__class__ = type(base_cls_name, (mixin, base_cls), {})  # mixin needs to go first for our forward() logic to work
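`extend_instance` swaps an object's class for a dynamically created subclass whose MRO puts the mixin first, which is how `OtterLMMixin.forward` below gets to run before the wrapped language model's own `forward`. A toy illustration with made-up class names:

```
class LoudMixin:
    def forward(self):
        # runs first in the MRO, then defers to the original class
        return "mixin saw: " + super().forward()

class Base:
    def forward(self):
        return "base forward"

obj = Base()
extend_instance(obj, LoudMixin)  # obj.__class__ becomes type("Base", (LoudMixin, Base), {})
print(obj.forward())             # -> "mixin saw: base forward"
```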
-
-
-def getattr_recursive(obj, att):
-    """
-    Return nested attribute of obj
-    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
-    """
-    if att == "":
-        return obj
-    i = att.find(".")
-    if i < 0:
-        return getattr(obj, att)
-    else:
-        return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
-
-
-def setattr_recursive(obj, att, val):
-    """
-    Set nested attribute of obj
-    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
-    """
-    if "." in att:
-        obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
-    setattr(obj, att.split(".")[-1], val)
-
-
-def exists(val):
-    return val is not None
-
-
-class OtterPerceiverBlock(nn.Module):
-    def __init__(self, *, dim: int, dim_head: int = 64, heads: int = 8, mult: int = 4):
-        super().__init__()
-        self.scale = dim_head**-0.5
-        self.heads = heads
-        inner_dim = dim_head * heads
-        ff_dim = dim * mult
-        self.norm_media = nn.LayerNorm(dim)
-        self.norm_latents = nn.LayerNorm(dim)
-
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
-        self.to_out = nn.Linear(inner_dim, dim, bias=False)
-        self.feed_forward = nn.ModuleList(
-            [
-                nn.LayerNorm(dim),
-                nn.Linear(dim, ff_dim, bias=False),
-                nn.GELU(),
-                nn.Linear(ff_dim, dim, bias=False),
-            ]
-        )
-
-    def forward(self, x: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x (torch.Tensor): image features
-                shape (b, T, n1, D)
-            latents (torch.Tensor): latent features
-                shape (b, T, n2, D)
-        """
-        x = self.norm_media(x)
-        residual_latents = latents
-        latents = self.norm_latents(latents)
-
-        h = self.heads
-
-        q = self.to_q(latents)
-        kv_input = torch.cat((x, latents), dim=-2)
-        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
-        q = rearrange(q, "b t n (h d) -> b h t n d", h=h)
-        k = rearrange(k, "b t n (h d) -> b h t n d", h=h)
-        v = rearrange(v, "b t n (h d) -> b h t n d", h=h)
-        q = q * self.scale
-
-        # attention
-        sim = torch.einsum("... i d, ... j d -> ... i j", q, k)
-        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
-        attn = sim.softmax(dim=-1)
-
-        out = torch.einsum("... i j, ... j d -> ... i d", attn, v)
-        out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
-        out = self.to_out(out) + residual_latents
-        residual_out = out
-        for layer in self.feed_forward:
-            out = layer(out)
-        return out + residual_out
-
-
-class OtterPerceiverResampler(nn.Module):
-    def __init__(
-        self,
-        *,
-        dim: int,
-        depth: int = 6,
-        dim_head: int = 64,
-        heads: int = 8,
-        num_latents: int = 64,
-        # max_num_frames: int = 128,
-        max_num_media: Optional[int] = None,
-        max_num_frames: Optional[int] = None,
-        ff_mult: int = 4,
-    ):
-        super().__init__()
-        self.latents = nn.Parameter(torch.randn(num_latents, dim))
-        self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None
-        self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None
-
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(OtterPerceiverBlock(dim=dim, dim_head=dim_head, heads=heads, mult=ff_mult))
-
-        self.norm = nn.LayerNorm(dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x (torch.Tensor): image features
-                shape (b, T, F, v, D)
-        Returns:
-            shape (b, T, n, D) where n is self.num_latents
-        """
-        b, T, F, v = x.shape[:4]
-
-        # frame and media time embeddings
-        if exists(self.frame_embs):
-            frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
-            x = x + frame_embs
-        x = rearrange(x, "b T F v d -> b T (F v) d")  # flatten the frame and spatial dimensions
-        if exists(self.media_time_embs):
-            x = x + self.media_time_embs[:T]
-
-        # blocks
-        latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
-        for block in self.layers:
-            latents = block(x, latents)
-        return self.norm(latents)
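To make the resampler's expected shapes concrete, a quick smoke test under assumed sizes (dim=1024 matches the `vis_dim` used by the models below; the batch, image, and token counts are arbitrary):

```
resampler = OtterPerceiverResampler(dim=1024)
x = torch.randn(2, 3, 1, 256, 1024)  # (b, T_img, F, v, D): 2 samples, 3 images, 1 frame, 256 vision tokens
out = resampler(x)
print(out.shape)                     # torch.Size([2, 3, 64, 1024]): 256 tokens compressed to 64 latents per image
```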
-
-
-class OtterMaskedCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        dim: int,
-        dim_visual: int,
-        dim_head: int = 64,
-        heads: int = 8,
-        only_attend_immediate_media: bool = True,
-    ):
-        super().__init__()
-        self.scale = dim_head**-0.5
-        self.heads = heads
-        inner_dim = dim_head * heads
-
-        self.norm = nn.LayerNorm(dim)
-
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
-        self.to_out = nn.Linear(inner_dim, dim, bias=False)
-
-        # whether text attends only to the immediately preceding image, or to all previous images
-        self.only_attend_immediate_media = only_attend_immediate_media
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        media: torch.Tensor,
-        media_locations: Optional[torch.BoolTensor] = None,
-        attend_previous: bool = True,
-    ) -> torch.Tensor:
-        """
-        Args:
-            x (torch.Tensor): text features
-                shape (B, T_txt, D_txt)
-            media (torch.Tensor): image features
-                shape (B, T_img, n, D_img) where n is the dim of the latents
-            media_locations: boolean mask identifying the media tokens in x
-                shape (B, T_txt)
-            attend_previous: bool. If False, text tokens ignore the immediately
-                preceding image and attend to the following image instead.
-        """
-        _, T_img, n = media.shape[:3]
-        h = self.heads
-
-        x = self.norm(x)
-
-        q = self.to_q(x)
-        media = rearrange(media, "b t n d -> b (t n) d")
-
-        k, v = self.to_kv(media).chunk(2, dim=-1)
-        if not XFORMERS_AVAIL:
-            q = rearrange(q, "b n (h d) -> b h n d", h=h)
-            k = rearrange(k, "b n (h d) -> b h n d", h=h)
-            v = rearrange(v, "b n (h d) -> b h n d", h=h)
-            q = q * self.scale
-
-            sim = torch.einsum("... i d, ... j d -> ... i j", q, k)
-            if exists(media_locations):
-                # at each boolean of True, increment the time counter (relative to media time)
-                text_time = media_locations.cumsum(dim=-1)
-                media_time = torch.arange(T_img, device=x.device) + 1
-
-                if not attend_previous:
-                    text_time[~media_locations] += 1
-                    # make sure max is still the number of images in the sequence
-                    text_time[
-                        text_time
-                        > repeat(
-                            torch.count_nonzero(media_locations, dim=1),
-                            "b -> b i",
-                            i=text_time.shape[1],
-                        )
-                    ] = 0
-
-                # text time must equal media time if only attending to most immediate image
-                # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
-                mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
-
-                text_to_media_mask = mask_op(
-                    rearrange(text_time, "b i -> b 1 i 1"),
-                    repeat(media_time, "j -> 1 1 1 (j n)", n=n),
-                )
-                sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
-
-            sim = sim - sim.amax(dim=-1, keepdim=True).detach()
-            attn = sim.softmax(dim=-1)
-
-            if exists(media_locations) and self.only_attend_immediate_media:
-                # any text without a preceding media needs to have attention zeroed out
-                text_without_media_mask = text_time == 0
-                text_without_media_mask = rearrange(text_without_media_mask, "b i -> b 1 i 1")
-                attn = attn.masked_fill(text_without_media_mask, 0.0)
-
i d", attn, v) - out = rearrange(out, "b h n d -> b n (h d)") - else: - q = rearrange(q, "b n (h d) -> b n h d", h=h) - k = rearrange(k, "b n (h d) -> b n h d", h=h) - v = rearrange(v, "b n (h d) -> b n h d", h=h) - attn_mask = None - out = xops.memory_efficient_attention(q, k, v, attn_bias=attn_mask, scale=self.scale) - return self.to_out(out) - - -class OtterGatedCrossAttentionBlock(nn.Module): - def __init__( - self, - *, - dim: int, - dim_visual: int, - dim_head: int = 64, - heads: int = 8, - ff_mult: int = 4, - only_attend_immediate_media: bool = True, - ): - super().__init__() - self.attn = OtterMaskedCrossAttention( - dim=dim, - dim_visual=dim_visual, - dim_head=dim_head, - heads=heads, - only_attend_immediate_media=only_attend_immediate_media, - ) - self.attn_gate = nn.Parameter(torch.tensor([0.0])) - self.feed_forward = nn.ModuleList( - [ - nn.LayerNorm(dim), - nn.Linear(dim, dim * ff_mult, bias=False), - nn.GELU(), - nn.Linear(dim * ff_mult, dim, bias=False), - ] - ) - self.ff_gate = nn.Parameter(torch.tensor([0.0])) - - def forward( - self, - x: torch.Tensor, - media: torch.Tensor, - media_locations: Optional[torch.BoolTensor] = None, - attend_previous: bool = True, - ) -> torch.Tensor: - x = ( - self.attn( - x, - media, - media_locations=media_locations, - attend_previous=attend_previous, - ) - * self.attn_gate.tanh() - + x - ) - residual_x = x - for ff in self.feed_forward: - x = ff(x) - x = x * self.ff_gate.tanh() + residual_x - - return x - - -class OtterLayer(nn.Module): - def __init__(self, gated_cross_attn_layer: nn.Module, decoder_layer: nn.Module): - super().__init__() - self.gated_cross_attn_layer = gated_cross_attn_layer - self.decoder_layer = decoder_layer - self.vis_x = None - self.media_locations = None - - def is_conditioned(self) -> bool: - """Check whether the layer is conditioned.""" - return self.vis_x is not None - - # Used this great idea from this implementation of Otter (https://github.com/dhansmair/otter-mini/) - def condition_vis_x(self, vis_x) -> None: - self.vis_x = vis_x - - def condition_media_locations(self, media_locations) -> None: - self.media_locations = media_locations - - def condition_attend_previous(self, attend_previous) -> None: - self.attend_previous = attend_previous - - def forward( - self, - lang_x: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - **decoder_layer_kwargs, - ): - if self.gated_cross_attn_layer is None: - return self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs) - - if self.vis_x is None: - raise ValueError("vis_x must be conditioned before forward pass") - - if self.media_locations is None: - raise ValueError("media_locations must be conditioned before forward pass") - - lang_x = self.gated_cross_attn_layer( - lang_x, - self.vis_x, - media_locations=self.media_locations, - attend_previous=self.attend_previous, - ) - lang_x = self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs) - return lang_x - - -class OtterLMMixin(nn.Module): - """ - Mixin to add cross-attention layers to a language model. 
- """ - - def set_decoder_layers_attr_name(self, decoder_layers_attr_name): - self.decoder_layers_attr_name = decoder_layers_attr_name - - def _get_decoder_layers(self): - return getattr_recursive(self, self.decoder_layers_attr_name) - - def _set_decoder_layers(self, value): - setattr_recursive(self, self.decoder_layers_attr_name, value) - - def init_otter( - self, - media_token_id: int, - vis_hidden_size: int, - cross_attn_every_n_layers: int, - use_media_placement_augmentation: bool, - ): - """ - Initialize Otter by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations. - """ - - gated_cross_attn_layers = nn.ModuleList( - [ - OtterGatedCrossAttentionBlock( - dim=self.config.hidden_size, - dim_visual=vis_hidden_size, - ) - if (layer_idx + 1) % cross_attn_every_n_layers == 0 - else None - for layer_idx, _ in enumerate(self._get_decoder_layers()) - ] - ) - self._set_decoder_layers( - nn.ModuleList( - [ - OtterLayer(gated_cross_attn_layer, decoder_layer) - for gated_cross_attn_layer, decoder_layer in zip(gated_cross_attn_layers, self._get_decoder_layers()) - ] - ) - ) - self.media_token_id = media_token_id - self.use_media_placement_augmentation = use_media_placement_augmentation - self.initialized_otter = True - - def forward(self, *input, **kwargs): - """Condition the Otter layers on the media locations before forward()""" - if not self.initialized_otter: - raise ValueError("Otter layers are not initialized. Please call `init_otter` first.") - - input_ids = kwargs["input_ids"] if "input_ids" in kwargs else input[0] - media_locations = input_ids == self.media_token_id - # IMPORTANT: Force `attend_previous` to True when we place training data as caption<|endofchunk|> - # attend_previous = ( - # (random.random() < 0.5) if self.use_media_placement_augmentation else False - # ) - attend_previous = (random.random() < 0.5) if self.use_media_placement_augmentation else True - # attend_previous = self.only_attend_previous - - if self.__class__.__name__ == "LlamaForCausalLM": - for layer in self.get_decoder().layers: - layer.condition_media_locations(media_locations) - layer.condition_attend_previous(attend_previous) - elif self.__class__.__name__ in ["MPTForCausalLM", "MosaicGPT"]: - for layer in self.get_decoder().blocks: - layer.condition_media_locations(media_locations) - layer.condition_attend_previous(attend_previous) - else: - print("inavaliable text encoder") - return super().forward(*input, **kwargs) # Call the other parent's forward method - - def is_conditioned(self) -> bool: - """Check whether all decoder layers are already conditioned.""" - return all(l.is_conditioned() for l in self._get_decoder_layers()) - - def clear_conditioned_layers(self) -> None: - for layer in self._get_decoder_layers(): - layer.condition_vis_x(None) - layer.condition_media_locations(None) - layer.condition_attend_previous(None) - - -class OtterPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = OtterConfig - base_model_prefix = "otter" - supports_gradient_checkpointing = True - _no_split_modules = ["OtterPerceiverBlock", "CLIPEncoderLayer", "OtterLayer"] - - def _init_weights(self, module): - """Otter requires no specific initialization""" - return super()._init_weights(module) - - -class OtterModel(OtterPreTrainedModel): - config_class = OtterConfig - - def __init__( - self, - config: OtterConfig, - ): - super().__init__(config) - - ### TODO: give "LlamaForCausalLM" as the name of text_config.architectures of Llama_based flamingo - if "llama" not in config.text_config._name_or_path: - if config.text_config.architectures[0] == "MPTForCausalLM": - text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct") - lang_encoder = MPTForCausalLM(config=config.text_config) - elif config.text_config.text_config.architectures[0] == "MosaicGPT": - text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mosaic-llama-redpajama-final-candidate") - lang_encoder = MosaicGPT(config=config.text_config) - elif config.text_config.architectures[0] == "RWForCausalLM": - text_tokenizer = AutoTokenizer.from_pretrained("PATH-TO-YOUR-FALCON") - lang_encoder = RWForCausalLM(config=config.text_config) - else: - text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path) - lang_encoder = LlamaForCausalLM(config=config.text_config) - vision_encoder = CLIPVisionModel(config=config.vision_config) - text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "", ""]}) - if text_tokenizer.pad_token is None: - text_tokenizer.add_special_tokens({"pad_token": ""}) - self.text_tokenizer = text_tokenizer - self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1] - self.media_token_id = text_tokenizer.encode("")[-1] - - extend_instance(lang_encoder, OtterLMMixin) - decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder) - lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name) - if lang_encoder.__class__.__name__ == "LlamaForCausalLM": - lang_encoder.resize_token_embeddings(len(text_tokenizer)) - self.lang_encoder = lang_encoder - - self.cross_attn_every_n_layers = config.cross_attn_every_n_layers - # use_media_placement_augmentation is strictly false for Otter model - self.use_media_placement_augmentation = False # config.use_media_placement_augmentation - self.max_num_frames = config.max_num_frames if hasattr(config, "max_num_frames") else None - - vision_encoder.output_tokens = True - self.vision_encoder = vision_encoder - - self.vis_dim = 1024 - self.perceiver = OtterPerceiverResampler(dim=self.vis_dim, max_num_frames=self.max_num_frames) - - self.lang_encoder.init_otter( - media_token_id=self.media_token_id, - vis_hidden_size=self.vis_dim, - cross_attn_every_n_layers=self.cross_attn_every_n_layers, - use_media_placement_augmentation=self.use_media_placement_augmentation, - ) - - if "lora_config" in config.__dict__: - print(f"Using LoRA with config:{config.lora_config}") - standard_modules = ["q_proj", "v_proj"] - if config.text_config.architectures is None: config.text_config.architectures = ['LlamaForCausalLM'] - lang_encoder_short_name = MODEL_CLASSES[config.text_config.architectures[0]] - model_to_lora_modules = { - "llama": standard_modules, - "opt": standard_modules, - "gptj": standard_modules, - "gpt_neox": ["query_key_value"], - "mpt": ["Wqkv"], - } - lora_config = LoraConfig( - r=config.lora_config["r"], - lora_alpha=config.lora_config["lora_alpha"], - 
-                lora_dropout=config.lora_config["lora_dropout"],
-                task_type=TaskType.CAUSAL_LM,
-                target_modules=model_to_lora_modules[lang_encoder_short_name],
-            )
-            self.lang_encoder = get_peft_model(self.lang_encoder, lora_config)
-            self.lang_encoder.print_trainable_parameters()
-
-        self.post_init()
-
-    def get_input_embeddings(self) -> nn.Module:
-        return self.lang_encoder.get_input_embeddings()
-
-    def set_input_embeddings(self, new_embeddings):
-        self.lang_encoder.set_input_embeddings(new_embeddings)
-
-    def get_output_embeddings(self) -> nn.Module:
-        return self.lang_encoder.get_output_embeddings()
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lang_encoder.set_output_embeddings(new_embeddings)
-
-    def get_image_encoder(self) -> nn.Module:
-        return self.vision_encoder
-
-    def get_lang_encoder(self) -> nn.Module:
-        return self.lang_encoder
-
-    def tie_weights(self):
-        return super().tie_weights()
-
-    def init_weights(self):
-        # Freeze all parameters in vision encoder
-        for param in self.vision_encoder.parameters():
-            param.requires_grad = False
-        # Freeze all parameters in lang encoders except gated_cross_attn_layers
-        for name, param in self.lang_encoder.named_parameters():
-            if "gated_cross_attn_layer" not in name:
-                param.requires_grad = False
-        # Unfreeze LM input embeddings
-        self.lang_encoder.get_input_embeddings().requires_grad_(True)
-        ## MPTForCausalLM has tied word embeddings
-        if self.lang_encoder.__class__.__name__ == "LlamaForCausalLM":
-            self.lang_encoder.lm_head.requires_grad_(True)
-        # assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
-        # print model size in billions of parameters with 2 decimal places
-        print(f"Trainable param: {(sum(p.numel() for p in self.parameters() if p.requires_grad)) / 1e9:.2f} B")
-
-    def forward(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cached_vision_x: bool = False,
-        clear_conditioned_layers: bool = True,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> CausalLMOutputWithPast:
-        """
-        Forward pass of Otter.
-
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W) with F=1
-            lang_x (torch.Tensor): Language input ids
-                shape (B, T_txt)
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-            labels (torch.Tensor, optional): Labels. Defaults to None.
-            clear_conditioned_layers: if True, clear the conditioned layers
-                once the forward pass is completed. Set this to false if the
-                same set of images will be reused in another subsequent
-                forward pass.
-            past_key_values: pre-computed values to pass to language model.
-                See past_key_values documentation in Hugging Face
-                CausalLM models.
-            use_cache: whether to use cached key values. See use_cache
-                documentation in Hugging Face CausalLM models.
-        """
-        assert (vision_x is not None) or use_cached_vision_x, "Must provide vision_x or set use_cached_vision_x to True."
-
-        if use_cached_vision_x:
-            # Case: use cached; vision_x should be cached and other
-            # vision-related inputs should not be provided.
-            assert vision_x is None, "Expect vision_x to be None when use_cached_vision_x is True."
-            assert self.lang_encoder.is_conditioned()
-
-        else:
-            # Case: do not use caching (i.e. this is a standard forward pass)
-            self._encode_vision_x(vision_x=vision_x)
-
-        output = self.lang_encoder(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            labels=labels,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            **kwargs,
-        )
-
-        if clear_conditioned_layers:
-            self.lang_encoder.clear_conditioned_layers()
-
-        return output
-
-    def _encode_vision_x(self, vision_x: torch.Tensor):
-        """
-        Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W)
-                Images in the same chunk are collated along T_img, and frames are collated along F
-                Currently only F=1 is supported (single-frame videos)
-
-        rearrange code based on https://github.com/dhansmair/flamingo-mini
-        """
-
-        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-        b, T, F = vision_x.shape[:3]
-
-        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-        with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)[0][:, 1:, :]
-        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-
-        vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
-
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_vis_x(vision_x)
-
-
-class OtterForConditionalGeneration(OtterPreTrainedModel):
-    config_class = OtterConfig
-
-    def __init__(
-        self,
-        config: OtterConfig,
-    ):
-        super().__init__(config)
-        ### TODO: give "LlamaForCausalLM" as the name of text_config.architectures of Llama-based flamingo
-        if "llama" not in config.text_config._name_or_path:
-            if config.text_config.architectures[0] == "MPTForCausalLM":
-                text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
-                lang_encoder = MPTForCausalLM(config=config.text_config)
-            elif config.text_config.architectures[0] == "MosaicGPT":
-                text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mosaic-llama-redpajama-final-candidate")
-                lang_encoder = MosaicGPT(config=config.text_config)
-            elif config.text_config.architectures[0] == "RWForCausalLM":
-                text_tokenizer = AutoTokenizer.from_pretrained("PATH-TO-YOUR-FALCON")
-                lang_encoder = RWForCausalLM(config=config.text_config)
-            elif config.text_config.architectures[0] == "LlamaForCausalLM":
-                text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path)
-                lang_encoder = LlamaForCausalLM(config=config.text_config)
-            else:
-                import pdb
-
-                pdb.set_trace()
-        else:
-            text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path)
-            lang_encoder = LlamaForCausalLM(config=config.text_config)
-        vision_encoder = CLIPVisionModel(config=config.vision_config)
-
-        text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>", "<answer>"]})
-        if text_tokenizer.pad_token is None:
-            text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-        self.text_tokenizer = text_tokenizer
-        self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1]
-        self.media_token_id = text_tokenizer.encode("<image>")[-1]
-
-        extend_instance(lang_encoder, OtterLMMixin)
-        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
-        lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
-        if lang_encoder.__class__.__name__ == "LlamaForCausalLM":
-            lang_encoder.resize_token_embeddings(len(text_tokenizer))
-        self.lang_encoder = lang_encoder
-
-        self.cross_attn_every_n_layers = config.cross_attn_every_n_layers
-        # use_media_placement_augmentation is strictly false for the Otter model
-        self.use_media_placement_augmentation = False  # config.use_media_placement_augmentation
-        self.max_num_frames = config.max_num_frames if hasattr(config, "max_num_frames") else None
-
-        # Informative print statement
-        if self.max_num_frames is None or self.max_num_frames == 1:
-            print(f"The current model version is configured for Otter-Image with max_num_frames set to {self.max_num_frames}.")
-        else:
-            print(f"The current model version is configured for Otter-Video with a maximum of {self.max_num_frames} frames.")
-
-        vision_encoder.output_tokens = True
-        self.vision_encoder = vision_encoder
-
-        self.vis_dim = 1024
-        self.perceiver = OtterPerceiverResampler(dim=self.vis_dim, max_num_frames=self.max_num_frames)
-
-        self.lang_encoder.init_otter(
-            media_token_id=self.media_token_id,
-            vis_hidden_size=self.vis_dim,
-            cross_attn_every_n_layers=self.cross_attn_every_n_layers,
-            use_media_placement_augmentation=self.use_media_placement_augmentation,
-        )
-
-        if "lora_config" in config.__dict__:
-            original_architecture_name = self.lang_encoder.__class__.__name__
-            print(f"Using LoRA with config: {config.lora_config}")
-            standard_modules = ["q_proj", "v_proj"]
-            if config.text_config.architectures is None:
-                config.text_config.architectures = ["LlamaForCausalLM"]
-            lang_encoder_short_name = MODEL_CLASSES[config.text_config.architectures[0]]
-            model_to_lora_modules = {
-                "llama": standard_modules,
-                "opt": standard_modules,
-                "gptj": standard_modules,
-                "gpt_neox": ["query_key_value"],
-                "mpt": ["Wqkv"],
-            }
-            lora_config = LoraConfig(
-                r=config.lora_config["r"],
-                lora_alpha=config.lora_config["lora_alpha"],
-                lora_dropout=config.lora_config["lora_dropout"],
-                task_type=TaskType.CAUSAL_LM,
-                target_modules=model_to_lora_modules[lang_encoder_short_name],
-            )
-            self.lang_encoder = get_peft_model(self.lang_encoder, lora_config)
-            self.lang_encoder.print_trainable_parameters()
-            self.lang_encoder.__class__.__name__ = f"{original_architecture_name}LoRA"
-
-        self.post_init()
-
-    def get_input_embeddings(self) -> nn.Module:
-        return self.lang_encoder.get_input_embeddings()
-
-    def set_input_embeddings(self, new_embeddings):
-        self.lang_encoder.set_input_embeddings(new_embeddings)
-
-    def get_output_embeddings(self) -> nn.Module:
-        return self.lang_encoder.get_output_embeddings()
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lang_encoder.set_output_embeddings(new_embeddings)
-
-    def get_image_encoder(self) -> nn.Module:
-        return self.vision_encoder
-
-    def get_lang_encoder(self) -> nn.Module:
-        return self.lang_encoder
-
-    def init_weights(self):
-        # Freeze all parameters in the model
-        for param in self.parameters():
-            param.requires_grad = False
-
-        # Unfreeze the vision encoder if requested
-        if "train_vision_encoder" in self.config.__dict__ and self.config.train_vision_encoder is True:
-            for param in self.vision_encoder.parameters():
-                param.requires_grad = True
-
-        # Unfreeze the language encoder if requested
-        if "train_lang_encoder" in self.config.__dict__ and self.config.train_lang_encoder is True:
-            for name, param in self.lang_encoder.named_parameters():
-                param.requires_grad = True
-
-        if "lora_config" in self.config.__dict__:
-            # Use another logic to unfreeze gated_cross_attn_layers and perceivers
-            print(f"LoRA trainable param: {(sum(param.numel() for name, param in self.lang_encoder.named_parameters() if 'lora' in name)) / 1e6:.3f} M")
-            for name, param in self.lang_encoder.named_parameters():
-                if "lora" in name:
-                    param.requires_grad = True
-
-        # Unfreeze the gated cross-attention layers
-        for name, param in self.lang_encoder.named_parameters():
-            if "gated_cross_attn_layer" in name:
-                param.requires_grad = True
-
-        # Unfreeze the perceiver resampler
-        for name, param in self.named_parameters():
-            if "perceiver" in name:
-                param.requires_grad = True
-        # Unfreeze LM input and output embeddings
-        self.lang_encoder.get_input_embeddings().requires_grad_(True)
-        ## MPTForCausalLM has tied word embeddings
-        if "LlamaForCausalLM" in self.lang_encoder.__class__.__name__:
-            self.lang_encoder.lm_head.requires_grad_(True)
-        # print("====================Model Grad Part====================")
-        total_params = 0
-        for name, param in self.named_parameters():
-            if param.requires_grad:
-                total_params += param.numel()
-                # print(f"Parameter: {name}, Size: {param.numel() / 1e6:.6f} M")
-        print(f"Total Trainable param: {total_params / 1e9:.6f} B")
-        # print(f"Total Trainable param: {(sum(p.numel() for p in self.parameters() if p.requires_grad)) / 1e9:.6f} B")
-
-    def forward(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cached_vision_x: bool = False,
-        clear_conditioned_layers: bool = True,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> CausalLMOutputWithPast:
-        """
-        Forward pass of Otter.
-
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W) with F=1
-            lang_x (torch.Tensor): Language input ids
-                shape (B, T_txt)
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-            labels (torch.Tensor, optional): Labels. Defaults to None.
-            clear_conditioned_layers: if True, clear the conditioned layers
-                once the forward pass is completed. Set this to false if the
-                same set of images will be reused in another subsequent
-                forward pass.
-            past_key_values: pre-computed values to pass to language model.
-                See past_key_values documentation in Hugging Face
-                CausalLM models.
-            use_cache: whether to use cached key values. See use_cache
-                documentation in Hugging Face CausalLM models.
-        """
-        assert (vision_x is not None) or use_cached_vision_x, "Must provide vision_x or set use_cached_vision_x to True."
-
-        if use_cached_vision_x:
-            # Case: use cached; vision_x should be cached and other
-            # vision-related inputs should not be provided.
-            assert vision_x is None, "Expect vision_x to be None when use_cached_vision_x is True."
-            assert self.lang_encoder.is_conditioned()
-
-        else:
-            # Case: do not use caching (i.e. this is a standard forward pass)
-            self._encode_vision_x(vision_x=vision_x)
-
-        output = self.lang_encoder(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            labels=labels,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            **kwargs,
-        )
-
-        if clear_conditioned_layers:
-            self.lang_encoder.clear_conditioned_layers()
-
-        return output
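A hedged end-to-end sketch of the forward pass, assuming `model` is an already-instantiated `OtterForConditionalGeneration` and using the documented shapes (B=1, T_img=1, F=1, 224x224 RGB; the prompt string is illustrative):

```
vision_x = torch.randn(1, 1, 1, 3, 224, 224)  # (B, T_img, F, C, H, W)
lang_x = model.text_tokenizer("<image>An image of", return_tensors="pt")["input_ids"]
output = model(vision_x=vision_x, lang_x=lang_x)
print(output.logits.shape)                    # (B, T_txt, vocab_size)
```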
-
-    def _encode_vision_x(self, vision_x: torch.Tensor):
-        """
-        Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W)
-                Images in the same chunk are collated along T_img, and frames are collated along F
-                Currently only F=1 is supported (single-frame videos)
-
-        rearrange code based on https://github.com/dhansmair/flamingo-mini
-        """
-
-        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-        b, T, F = vision_x.shape[:3]
-
-        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-        with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)[0][:, 1:, :]
-        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-
-        vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
-
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_vis_x(vision_x)
-
-    @torch.no_grad()
-    def generate(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        **generate_kwargs,
-    ):
-        """
-        Generate text conditioned on vision and language inputs.
-
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W)
-                images in the same chunk are collated along T_img, and frames are collated along F
-                currently only F=1 is supported (single-frame videos)
-            lang_x (torch.Tensor): Language input
-                shape (B, T_txt)
-            max_length (int, optional): Maximum length of the output. Defaults to None.
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-        Returns:
-            torch.Tensor: lang_x with generated tokens appended to it
-        """
-        if hasattr(self, "_hf_hook"):
-            # add a hook to make sure that the output of lang_encoder is mapped to the same device as the lang_x
-            hook = AlignDevicesHook(
-                execution_device=lang_x.device,
-                io_same_device=True,
-                place_submodules=False,
-            )
-            add_hook_to_module(self.lang_encoder, hook)
-        num_beams = generate_kwargs.get("num_beams", 1)
-        if num_beams > 1:
-            vision_x = vision_x.repeat_interleave(num_beams, dim=0)
-        self._encode_vision_x(vision_x=vision_x)
-        output = self.lang_encoder.generate(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            eos_token_id=self.eoc_token_id,
-            **generate_kwargs,
-        )
-
-        self.lang_encoder.clear_conditioned_layers()
-        return output
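And a matching sketch for generation; extra keyword arguments such as `max_new_tokens` flow through `**generate_kwargs` to the underlying Hugging Face `generate`:

```
generated = model.generate(
    vision_x=vision_x,
    lang_x=lang_x,
    attention_mask=torch.ones_like(lang_x),
    max_new_tokens=32,
)
print(model.text_tokenizer.decode(generated[0], skip_special_tokens=True))
```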
diff --git a/mllm/src/__init__.py b/mllm/src/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/mllm/src/factory.py b/mllm/src/factory.py
deleted file mode 100644
index e6e12e61cf421a44ea189e882a5a3feaf6bab4b3..0000000000000000000000000000000000000000
--- a/mllm/src/factory.py
+++ /dev/null
@@ -1,167 +0,0 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import open_clip
-
-from .flamingo import Flamingo
-from .flamingo_lm import FlamingoLMMixin
-from .utils import extend_instance
-from .mpt_lora_patch.modeling_mpt import MPTForCausalLM
-
-from peft import (
-    get_peft_model,
-    LoraConfig,
-    get_peft_model_state_dict,
-    PeftConfig,
-    PeftModel
-)
-
-
-def create_model_and_transforms(
-    clip_vision_encoder_path: str,
-    clip_vision_encoder_pretrained: str,
-    clip_vision_encoder_cache_dir: str,
-    lang_encoder_path: str,
-    tokenizer_path: str,
-    cross_attn_every_n_layers: int = 1,
-    use_local_files: bool = False,
-    decoder_layers_attr_name: str = None,
-    freeze_lm_embeddings: bool = False,
-    use_peft = None,
-    peft_config = None,
-    freeze_lang_encoder = True,
-    freeze_perceiver = False,
-    freeze_gated_cross_attn_layers = False,
-    cache_dir = None,
-    max_num_frames = None,
-    **flamingo_kwargs,
-):
-    """
-    Initialize a Flamingo model from a pretrained vision encoder and language encoder.
-    Appends special tokens to the tokenizer and freezes backbones.
-
-    Args:
-        clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32")
-        clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k")
-        lang_encoder_path (str): path to pretrained language encoder
-        tokenizer_path (str): path to pretrained tokenizer
-        cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
-        use_local_files (bool, optional): whether to use local files. Defaults to False.
-        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
-    Returns:
-        Flamingo: Flamingo model from pretrained vision and language encoders
-        Image processor: Pipeline to preprocess input images
-        Tokenizer: A tokenizer for the language model
-    """
-    vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
-        clip_vision_encoder_path, pretrained=clip_vision_encoder_pretrained, cache_dir=clip_vision_encoder_cache_dir
-    )
-    # set the vision encoder to output the visual features
-    vision_encoder.visual.output_tokens = True
-
-    text_tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_path,
-        local_files_only=use_local_files,
-        trust_remote_code=True,
-        cache_dir=cache_dir,
-    )
-
-    # add Flamingo special tokens to the tokenizer
-    if use_peft:
-        text_tokenizer.add_special_tokens(
-            {"additional_special_tokens": ["<|endofchunk|>", "<image>", "<answer>"]}
-        )
-    else:
-        text_tokenizer.add_special_tokens(
-            {"additional_special_tokens": ["<|endofchunk|>", "<image>"]}
-        )
-
-    if text_tokenizer.pad_token is None:
-        # Issue: GPT models don't have a pad token, which we use to
-        # modify labels for the loss.
-        text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-
-    lang_encoder = MPTForCausalLM.from_pretrained(
-        lang_encoder_path,
-        local_files_only=use_local_files,
-        trust_remote_code=True,
-        cache_dir=cache_dir,
-    )
-
-    # hacks for MPT-1B, which doesn't have a get_input_embeddings method
-    if "mpt-1b-redpajama-200b" in lang_encoder_path:
-
-        class EmbeddingFnMixin:
-            def get_input_embeddings(self):
-                return self.transformer.wte
-
-            def set_input_embeddings(self, new_embeddings):
-                self.transformer.wte = new_embeddings
-
-        extend_instance(lang_encoder, EmbeddingFnMixin)
-
-    # convert LM to FlamingoLM
-    extend_instance(lang_encoder, FlamingoLMMixin)
-
-    if decoder_layers_attr_name is None:
-        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
-    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
-    lang_encoder.resize_token_embeddings(len(text_tokenizer))
-
-    if use_peft:
-        lang_encoder = get_peft_model(lang_encoder, peft_config)
-    # if peft_model_id:
-    #     lang_encoder.load_adapter(peft_model_id, "default")
-
-    model = Flamingo(
-        vision_encoder,
-        lang_encoder,
-        text_tokenizer.encode("<|endofchunk|>")[-1],
-        text_tokenizer.encode("<image>")[-1],
-        vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"][
-            "width"
-        ],
-        max_num_frames=max_num_frames,
-        cross_attn_every_n_layers=cross_attn_every_n_layers,
-        **flamingo_kwargs,
-    )
-
-    # Freeze all parameters
-    model.requires_grad_(False)
-    assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
-
-    # Unfreeze LoRA parameters
-    if use_peft:
-        for n, p in model.named_parameters():
-            if "lora" in n or "prompt_encoder" in n:
-                p.requires_grad = True
-    # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
-    if not freeze_perceiver:
-        model.perceiver.requires_grad_(True)
-    if not freeze_gated_cross_attn_layers:
-        model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
-    if not freeze_lm_embeddings:
-        model.lang_encoder.get_input_embeddings().requires_grad_(True)
-        # TODO: investigate also training the output embeddings when untied
-
-    print(
-        f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
-    )
-
-    return model, image_processor, text_tokenizer
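A hedged instantiation sketch; the encoder and tokenizer names here are illustrative placeholders, not values prescribed by this factory:

```
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    clip_vision_encoder_cache_dir="/tmp/clip-cache",
    lang_encoder_path="mosaicml/mpt-7b",
    tokenizer_path="mosaicml/mpt-7b",
    cross_attn_every_n_layers=4,
)
```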
-
-
-def _infer_decoder_layers_attr_name(model):
-    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
-        if k.lower() in model.__class__.__name__.lower():
-            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
-
-    raise ValueError(
-        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
-    )
-
-
-__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
-    "opt": "model.decoder.layers",
-    "gptj": "transformer.h",
-    "gpt-j": "transformer.h",
-    "pythia": "gpt_neox.layers",
-    "llama": "model.layers",
-    "gptneoxforcausallm": "gpt_neox.layers",
-    "mpt": "transformer.blocks",
-    "mosaicgpt": "transformer.blocks",
-}
diff --git a/mllm/src/flamingo.py b/mllm/src/flamingo.py
deleted file mode 100644
index 51a7b5eb83b335442807686c95d68e359b4006fc..0000000000000000000000000000000000000000
--- a/mllm/src/flamingo.py
+++ /dev/null
@@ -1,364 +0,0 @@
-import torch
-from einops import rearrange
-from torch import nn
-from .helpers import PerceiverResampler
-from torch.distributed.fsdp.wrap import (
-    enable_wrap,
-    wrap,
-)
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from torch.distributed.fsdp import (
-    FullyShardedDataParallel as FSDP,
-)
-
-from .utils import apply_with_stopping_condition
-
-
-class Flamingo(nn.Module):
-    def __init__(
-        self,
-        vision_encoder: nn.Module,
-        lang_encoder: nn.Module,
-        eoc_token_id: int,
-        media_token_id: int,
-        vis_dim: int,
-        max_num_frames: int,
-        cross_attn_every_n_layers: int = 1,
-        gradient_checkpointing: bool = False,
-    ):
-        """
-        Args:
-            vision_encoder (nn.Module): HF CLIPModel
-            lang_encoder (nn.Module): HF causal language model
-            eoc_token_id (int): Token id for <|endofchunk|>
-            media_token_id (int): Token id for <image>
-            vis_dim (int): Dimension of the visual features.
-                Visual features are projected to match this shape along the last dimension.
-            cross_attn_every_n_layers (int, optional): How often to apply cross attention after a transformer layer. Defaults to 1.
-        """
-        super().__init__()
-        self.eoc_token_id = eoc_token_id
-        self.media_token_id = media_token_id
-        self.vis_dim = vis_dim
-        if hasattr(lang_encoder.config, "d_model"):
-            self.lang_dim = lang_encoder.config.d_model  # mpt uses d_model
-        else:
-            self.lang_dim = lang_encoder.config.hidden_size
-
-        self.vision_encoder = vision_encoder.visual
-        self.perceiver = PerceiverResampler(dim=self.vis_dim, max_num_frames=max_num_frames)
-        self.lang_encoder = lang_encoder
-        self.lang_encoder.init_flamingo(
-            media_token_id=media_token_id,
-            lang_hidden_size=self.lang_dim,
-            vis_hidden_size=self.vis_dim,
-            cross_attn_every_n_layers=cross_attn_every_n_layers,
-            gradient_checkpointing=gradient_checkpointing,
-        )
-        self._use_gradient_checkpointing = gradient_checkpointing
-        self.perceiver._use_gradient_checkpointing = gradient_checkpointing
-        self.device = self.lang_encoder.device
-
-    def forward(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: torch.Tensor = None,
-        labels: torch.Tensor = None,
-        media_locations: torch.Tensor = None,
-        clear_conditioned_layers: bool = True,
-        past_key_values=None,
-        use_cache: bool = False,
-    ):
-        """
-        Forward pass of Flamingo.
-
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W) with F=1
-            lang_x (torch.Tensor): Language input ids
-                shape (B, T_txt)
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-            labels (torch.Tensor, optional): Labels. Defaults to None.
-            clear_conditioned_layers: if True, clear the conditioned layers
-                once the forward pass is completed. Set this to false if the
-                same set of images will be reused in another subsequent
-                forward pass.
-            past_key_values: pre-computed values to pass to language model.
-                See past_key_values documentation in Hugging Face
-                CausalLM models.
-            use_cache: whether to use cached key values. See use_cache
-                documentation in Hugging Face CausalLM models.
-        """
-        assert (
-            self.lang_encoder.initialized_flamingo
-        ), "Flamingo layers are not initialized. Please call `init_flamingo` first."
-
-        assert (
-            self.lang_encoder._use_cached_vision_x or vision_x is not None
-        ), "Must provide either vision_x or have precached media using cache_media()."
-
-        if self.lang_encoder._use_cached_vision_x:
-            # Case: use cached; vision_x should be cached and other
-            # vision-related inputs should not be provided.
-            assert (
-                vision_x is None
-            ), "Expect vision_x to be None when media has been cached using cache_media(). Try uncache_media() first."
-            assert self.lang_encoder.is_conditioned()
-
-        else:
-            # Case: do not use caching (i.e. this is a standard forward pass)
-            self._encode_vision_x(vision_x=vision_x)
-            self._condition_media_locations(input_ids=lang_x)
-
-        output = self.lang_encoder(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            labels=labels,
-            media_locations=media_locations,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-        )
-
-        if clear_conditioned_layers:
-            self.lang_encoder.clear_conditioned_layers()
-
-        return output
-
-    def generate(
-        self,
-        vision_x: torch.Tensor,
-        lang_x: torch.Tensor,
-        attention_mask: torch.Tensor = None,
-        media_locations: torch.Tensor = None,
-        num_beams=1,
-        min_new_tokens=None,
-        max_new_tokens=None,
-        temperature=1.0,
-        top_k=0,
-        top_p=1.0,
-        no_repeat_ngram_size=0,
-        repetition_penalty=1.0,
-        prefix_allowed_tokens_fn=None,
-        length_penalty=1.0,
-        num_return_sequences=1,
-        do_sample=False,
-        early_stopping=False,
-    ):
-        """
-        Generate text conditioned on vision and language inputs.
-
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W)
-                images in the same chunk are collated along T_img, and frames are collated along F
-                currently only F=1 is supported (single-frame videos)
-            lang_x (torch.Tensor): Language input
-                shape (B, T_txt)
-            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-            num_beams (int, optional): Number of beams. Defaults to 1.
-            min_new_tokens (int, optional): Minimum new tokens. Defaults to None.
-            max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
-            temperature (float, optional): Temperature. Defaults to 1.0.
-            top_k (int, optional): Top k. Defaults to 0.
-            top_p (float, optional): Top p. Defaults to 1.0.
-            no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
-            length_penalty (float, optional): Length penalty. Defaults to 1.0.
-            num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
-            do_sample (bool, optional): Do sample. Defaults to False.
-            early_stopping (bool, optional): Early stopping. Defaults to False.
-        Returns:
-            torch.Tensor: lang_x with generated tokens appended to it
-        """
-        if num_beams > 1:
-            vision_x = vision_x.repeat_interleave(num_beams, dim=0)
-
-        self.lang_encoder._use_cached_vision_x = True
-        self._encode_vision_x(vision_x=vision_x)
-
-        output = self.lang_encoder.generate(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            media_locations=media_locations,
-            eos_token_id=self.eoc_token_id,
-            num_beams=num_beams,
-            min_new_tokens=min_new_tokens,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            repetition_penalty=repetition_penalty,
-            length_penalty=length_penalty,
-            num_return_sequences=num_return_sequences,
-            do_sample=do_sample,
-            early_stopping=early_stopping,
-        )
-
-        self.lang_encoder.clear_conditioned_layers()
-        self.lang_encoder._use_cached_vision_x = False
-        return output
-
-    def _encode_vision_x(self, vision_x: torch.Tensor):
-        """
-        Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
-        Args:
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W)
-                Images in the same chunk are collated along T_img, and frames are collated along F
-                Currently only F=1 is supported (single-frame videos)
-
-        rearrange code based on https://github.com/dhansmair/flamingo-mini
-        """
-
-        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-        b, T, F = vision_x.shape[:3]
-        # assert F == 1, "Only single frame supported"
-
-        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-        with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)[1]
-        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-        vision_x = self.perceiver(vision_x)
-
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_vis_x(vision_x)
-
-    def wrap_fsdp(self, wrapper_kwargs, device_id):
-        """
-        Manually wraps submodules for FSDP and moves other parameters to device_id.
-
-        Why manually wrap?
-        - all parameters within the FSDP wrapper must have the same requires_grad.
-            We have a mix of frozen and unfrozen parameters.
-        - model.vision_encoder.visual needs to be individually wrapped or encode_vision_x errors
-            See: https://github.com/pytorch/pytorch/issues/82461#issuecomment-1269136344
-
-        The rough wrapping structure is:
-        - FlamingoModel
-            - FSDP(FSDP(vision_encoder))
-            - FSDP(FSDP(perceiver))
-            - lang_encoder
-                - FSDP(FSDP(input_embeddings))
-                - FlamingoLayers
-                    - FSDP(FSDP(gated_cross_attn_layer))
-                    - FSDP(FSDP(decoder_layer))
-                - FSDP(FSDP(output_embeddings))
-                - other parameters
-
-        Known issues:
-        - Our FSDP strategy is not compatible with tied embeddings. If the LM embeddings are tied,
-            train with DDP or set the --freeze_lm_embeddings flag to true.
-        - With FSDP + gradient ckpting, one can increase the batch size with seemingly no upper bound.
-            Although the training curves look okay, we found that downstream performance dramatically
-            degrades if the batch size is unreasonably large (e.g., 100 MMC4 batch size for OPT-125M).
-
-        FAQs about our FSDP wrapping strategy:
-        Why double wrap?
-        As of torch==2.0.1, FSDP's _post_forward_hook and _post_backward_hook
-        only free gathered parameters if the module is NOT FSDP root.
-
-        Why unfreeze the decoder layers?
-        See https://github.com/pytorch/pytorch/issues/95805
-        As of torch==2.0.1, FSDP's _post_backward_hook is only registered if the flat param
-        requires_grad=True.
-        We need the post-backward hook to fire to avoid OOM.
-        To effectively freeze the decoder layers, we exclude them from the optimizer.
-
-        What is assumed to be frozen vs. unfrozen?
-        We assume that the model is being trained under normal Flamingo settings
-        with these lines being called in factory.py:
-            ```
-            # Freeze all parameters
-            model.requires_grad_(False)
-            assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
-
-            # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
-            model.perceiver.requires_grad_(True)
-            model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
-            [optional] model.lang_encoder.get_input_embeddings().requires_grad_(True)
-            ```
-        """
-        # unfreeze the decoder layers
-        for block in self.lang_encoder.old_decoder_blocks:
-            block.requires_grad_(True)
-
-        # wrap in FSDP
-        with enable_wrap(wrapper_cls=FSDP, **wrapper_kwargs):
-            self.perceiver = wrap(wrap(self.perceiver))
-            self.lang_encoder.old_decoder_blocks = nn.ModuleList(
-                wrap(wrap(block)) for block in self.lang_encoder.old_decoder_blocks
-            )
-            self.lang_encoder.gated_cross_attn_layers = nn.ModuleList(
-                wrap(wrap(layer)) if layer is not None else None
-                for layer in self.lang_encoder.gated_cross_attn_layers
-            )
-            self.lang_encoder.init_flamingo_layers(self._use_gradient_checkpointing)
-            self.lang_encoder.set_input_embeddings(
-                wrap(wrap(self.lang_encoder.get_input_embeddings()))
-            )
-            self.lang_encoder.set_output_embeddings(
-                wrap(wrap(self.lang_encoder.get_output_embeddings()))
-            )
-            self.vision_encoder = wrap(wrap(self.vision_encoder))  # frozen
-
-        # manually move non-FSDP managed parameters to device_id
-        # these are all in lang_encoder
-        apply_with_stopping_condition(
-            module=self.lang_encoder,
-            apply_fn=lambda m: m.to(device_id),
-            apply_condition=lambda m: len(list(m.children())) == 0,
-            stopping_condition=lambda m: isinstance(m, FSDP),
-        )
-
-        # exclude the original decoder layers from the optimizer
-        for block in self.lang_encoder.old_decoder_blocks:
-            for p in block.parameters():
-                p.exclude_from_optimizer = True
-
-        # set up clip_grad_norm_ function
-        def clip_grad_norm_(max_norm):
-            self.perceiver.clip_grad_norm_(max_norm)
-            for layer in self.lang_encoder.gated_cross_attn_layers:
-                if layer is not None:
-                    layer.clip_grad_norm_(max_norm)
-            self.lang_encoder.get_input_embeddings().clip_grad_norm_(max_norm)
-
-        self.clip_grad_norm_ = clip_grad_norm_
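A rough sketch of a call site, under the freezing assumptions spelled out in the docstring; `local_rank` and the particular `wrapper_kwargs` shown here are assumptions about the surrounding training script, not part of this module:

```
model.wrap_fsdp(
    wrapper_kwargs=dict(device_id=local_rank, sync_module_states=True),
    device_id=local_rank,
)
# the original decoder blocks were tagged with exclude_from_optimizer above, so filter them out
optimizer = torch.optim.AdamW(
    [p for p in model.parameters()
     if p.requires_grad and not getattr(p, "exclude_from_optimizer", False)],
    lr=1e-4,
)
```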
-
-    def _condition_media_locations(self, input_ids: torch.Tensor):
-        """
-        Compute the media token locations from lang_x and condition the language model on these.
-        Args:
-            input_ids (torch.Tensor): Language input
-                shape (B, T_txt)
-        """
-        media_locations = input_ids == self.media_token_id
-
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_media_locations(media_locations)
-
-    def cache_media(self, input_ids: torch.Tensor, vision_x: torch.Tensor):
-        """
-        Pre-cache a prompt/sequence of images / text for log-likelihood evaluations.
-        All subsequent calls to forward() will generate attending to the LAST
-        image in vision_x.
-        This is not meant to be used to cache things for generate().
-        Args:
-            input_ids (torch.Tensor): Language input
-                shape (B, T_txt)
-            vision_x (torch.Tensor): Vision input
-                shape (B, T_img, F, C, H, W)
-                Images in the same chunk are collated along T_img, and frames are collated along F
-                Currently only F=1 is supported (single-frame videos)
-        """
-        self._encode_vision_x(vision_x=vision_x)
-        self._condition_media_locations(input_ids=input_ids)
-        self.lang_encoder._use_cached_vision_x = True
-
-    def uncache_media(self):
-        """
-        Clear all conditioning.
-        """
-        self.lang_encoder.clear_conditioned_layers()
-        self.lang_encoder._use_cached_vision_x = False
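A hedged sketch of the intended log-likelihood workflow: condition once on an image and prompt, score one or more continuations without re-encoding, then clear the cache (all tensors here are placeholders):

```
model.cache_media(input_ids=prompt_ids, vision_x=images)  # encode and condition once
out = model(
    vision_x=None,                    # cached vision features are reused
    lang_x=full_ids,                  # prompt + candidate continuation
    labels=full_ids,
    clear_conditioned_layers=False,   # keep the cache for the next candidate
)
nll = out.loss
model.uncache_media()                 # drop the conditioning when finished
```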
- """ - self.old_decoder_blocks = self._get_decoder_layers() - self.gated_cross_attn_layers = nn.ModuleList( - [ - GatedCrossAttentionBlock( - dim=lang_hidden_size, dim_visual=vis_hidden_size - ) - if (layer_idx + 1) % cross_attn_every_n_layers == 0 - else None - for layer_idx, _ in enumerate(self._get_decoder_layers()) - ] - ) - self.init_flamingo_layers(gradient_checkpointing) - self.media_token_id = media_token_id - self.initialized_flamingo = True - self._use_cached_vision_x = False - - def init_flamingo_layers(self, gradient_checkpointing): - """ - Re initializes the FlamingoLayers. - Propagates any changes made to self.gated_corss_attn_layers or self.old_decoder_blocks - """ - self._set_decoder_layers( - nn.ModuleList( - [ - FlamingoLayer( - gated_cross_attn_layer, decoder_layer, gradient_checkpointing - ) - for gated_cross_attn_layer, decoder_layer in zip( - self.gated_cross_attn_layers, self.old_decoder_blocks - ) - ] - ) - ) - - def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None, media_locations=None, **kwargs): - """Condition the Flamingo layers on the media locations before forward()""" - if not self.initialized_flamingo: - raise ValueError( - "Flamingo layers are not initialized. Please call `init_flamingo` first." - ) - - if media_locations is None: - media_locations = input_ids == self.media_token_id - if input_ids is not None: - assert input_ids.shape[0] == media_locations.shape[0] - assert input_ids.shape[1] == media_locations.shape[1] - if inputs_embeds is not None: - if inputs_embeds.shape[1] != media_locations.shape[1]: - generate_tokens = torch.full( - (inputs_embeds.shape[0], inputs_embeds.shape[1] - media_locations.shape[1]), False).to(media_locations.device) - media_locations = torch.cat((media_locations, generate_tokens), dim=1) - assert inputs_embeds.shape[0] == media_locations.shape[0] - assert inputs_embeds.shape[1] == media_locations.shape[1] - - - # if there are media already cached and we're generating and there are no media tokens in the input, - # we'll assume that ALL input tokens should attend to the last previous media that is cached. - # this is especially important for HF generate() compatibility, since generate() calls forward() - # repeatedly one token at a time (with no media tokens). - # without this check, the model would not attend to any images when generating (after the first token) - use_cached_media_locations = ( - self._use_cached_vision_x - and self.is_conditioned() - and not media_locations.any() - ) - - for layer in self._get_decoder_layers(): - if not use_cached_media_locations: - layer.condition_media_locations(media_locations) - layer.condition_use_cached_media(use_cached_media_locations) - - # package arguments for the other parent's forward. 
since we don't know the order of the arguments, - # make them all kwargs - kwargs["input_ids"] = input_ids - kwargs["attention_mask"] = attention_mask - kwargs["inputs_embeds"] = inputs_embeds - return super().forward(**kwargs) # Call the other parent's forward method - - def is_conditioned(self) -> bool: - """Check whether all decoder layers are already conditioned.""" - return all(l.is_conditioned() for l in self._get_decoder_layers()) - - def clear_conditioned_layers(self): - for layer in self._get_decoder_layers(): - layer.condition_vis_x(None) - layer.condition_media_locations(None) - layer.condition_use_cached_media(None) diff --git a/mllm/src/helpers.py b/mllm/src/helpers.py deleted file mode 100644 index 239503f8693c1c94d1441e496c1a6b90e0c25cdb..0000000000000000000000000000000000000000 --- a/mllm/src/helpers.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -Based on: https://github.com/lucidrains/flamingo-pytorch -""" - -import torch -from einops import rearrange, repeat -from einops_exts import rearrange_many -from torch import einsum, nn - - -def exists(val): - return val is not None - - -def FeedForward(dim, mult=4): - inner_dim = int(dim * mult) - return nn.Sequential( - nn.LayerNorm(dim), - nn.Linear(dim, inner_dim, bias=False), - nn.GELU(), - nn.Linear(inner_dim, dim, bias=False), - ) - - -class PerceiverAttention(nn.Module): - def __init__(self, *, dim, dim_head=64, heads=8): - super().__init__() - self.scale = dim_head**-0.5 - self.heads = heads - inner_dim = dim_head * heads - - self.norm_media = nn.LayerNorm(dim) - self.norm_latents = nn.LayerNorm(dim) - - self.to_q = nn.Linear(dim, inner_dim, bias=False) - self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) - self.to_out = nn.Linear(inner_dim, dim, bias=False) - - def forward(self, x, latents): - """ - Args: - x (torch.Tensor): image features - shape (b, T, n1, D) - latent (torch.Tensor): latent features - shape (b, T, n2, D) - """ - x = self.norm_media(x) - latents = self.norm_latents(latents) - - h = self.heads - - q = self.to_q(latents) - kv_input = torch.cat((x, latents), dim=-2) - k, v = self.to_kv(kv_input).chunk(2, dim=-1) - q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h) - q = q * self.scale - - # attention - sim = einsum("... i d, ... j d -> ... i j", q, k) - sim = sim - sim.amax(dim=-1, keepdim=True).detach() - attn = sim.softmax(dim=-1) - - out = einsum("... i j, ... j d -> ... 
i d", attn, v) - out = rearrange(out, "b h t n d -> b t n (h d)", h=h) - return self.to_out(out) - - -class PerceiverResampler(nn.Module): - def __init__( - self, - *, - dim, - depth=6, - dim_head=64, - heads=8, - num_latents=64, - max_num_media=None, - max_num_frames=None, - ff_mult=4, - ): - super().__init__() - self.latents = nn.Parameter(torch.randn(num_latents, dim)) - self.frame_embs = ( - nn.Parameter(torch.randn(max_num_frames, dim)) - if exists(max_num_frames) - else None - ) - self.media_time_embs = ( - nn.Parameter(torch.randn(max_num_media, 1, dim)) - if exists(max_num_media) - else None - ) - - self.layers = nn.ModuleList([]) - for _ in range(depth): - self.layers.append( - nn.ModuleList( - [ - PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), - FeedForward(dim=dim, mult=ff_mult), - ] - ) - ) - - self.norm = nn.LayerNorm(dim) - - def forward(self, x): - """ - Args: - x (torch.Tensor): image features - shape (b, T, F, v, D) - Returns: - shape (b, T, n, D) where n is self.num_latents - """ - b, T, F, v = x.shape[:4] - - # frame and media time embeddings - if exists(self.frame_embs): - frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v) - x = x + frame_embs - x = rearrange( - x, "b T F v d -> b T (F v) d" - ) # flatten the frame and spatial dimensions - if exists(self.media_time_embs): - x = x + self.media_time_embs[:T] - - # blocks - latents = repeat(self.latents, "n d -> b T n d", b=b, T=T) - for attn, ff in self.layers: - latents = attn(x, latents) + latents - latents = ff(latents) + latents - return self.norm(latents) - - -# gated cross attention -class MaskedCrossAttention(nn.Module): - def __init__( - self, - *, - dim, - dim_visual, - dim_head=64, - heads=8, - only_attend_immediate_media=True, - ): - super().__init__() - self.scale = dim_head**-0.5 - self.heads = heads - inner_dim = dim_head * heads - - self.norm = nn.LayerNorm(dim) - - self.to_q = nn.Linear(dim, inner_dim, bias=False) - self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False) - self.to_out = nn.Linear(inner_dim, dim, bias=False) - - # whether for text to only attend to immediate preceding image, or all previous images - self.only_attend_immediate_media = only_attend_immediate_media - - def forward(self, x, media, media_locations=None, use_cached_media=False): - """ - Args: - x (torch.Tensor): text features - shape (B, T_txt, D_txt) - media (torch.Tensor): image features - shape (B, T_img, n, D_img) where n is the dim of the latents - media_locations: boolean mask identifying the media tokens in x - shape (B, T_txt) - use_cached_media: bool - If true, treat all of x as if they occur after the last media - registered in media_locations. T_txt does not need to exactly - equal media_locations.shape[1] in this case - """ - - if not use_cached_media: - assert ( - media_locations.shape[1] == x.shape[1] - ), f"media_location.shape is {media_locations.shape} but x.shape is {x.shape}" - - T_txt = x.shape[1] - _, T_img, n = media.shape[:3] - h = self.heads - - x = self.norm(x) - - q = self.to_q(x) - media = rearrange(media, "b t n d -> b (t n) d") - - k, v = self.to_kv(media).chunk(2, dim=-1) - q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h) - - q = q * self.scale - - sim = einsum("... i d, ... j d -> ... 
i j", q, k) - - if exists(media_locations): - media_time = torch.arange(T_img, device=x.device) + 1 - - if use_cached_media: - # text time is set to the last cached media location - text_time = repeat( - torch.count_nonzero(media_locations, dim=1), - "b -> b i", - i=T_txt, - ) - else: - # at each boolean of True, increment the time counter (relative to media time) - text_time = media_locations.cumsum(dim=-1) - - # text time must equal media time if only attending to most immediate image - # otherwise, as long as text time is greater than media time (if attending to all previous images / media) - mask_op = torch.eq if self.only_attend_immediate_media else torch.ge - - text_to_media_mask = mask_op( - rearrange(text_time, "b i -> b 1 i 1"), - repeat(media_time, "j -> 1 1 1 (j n)", n=n), - ) - sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max) - - sim = sim - sim.amax(dim=-1, keepdim=True).detach() - attn = sim.softmax(dim=-1) - - if exists(media_locations) and self.only_attend_immediate_media: - # any text without a preceding media needs to have attention zeroed out - text_without_media_mask = text_time == 0 - text_without_media_mask = rearrange( - text_without_media_mask, "b i -> b 1 i 1" - ) - attn = attn.masked_fill(text_without_media_mask, 0.0) - - out = einsum("... i j, ... j d -> ... i d", attn, v) - out = rearrange(out, "b h n d -> b n (h d)") - return self.to_out(out) - - -class GatedCrossAttentionBlock(nn.Module): - def __init__( - self, - *, - dim, - dim_visual, - dim_head=64, - heads=8, - ff_mult=4, - only_attend_immediate_media=True, - ): - super().__init__() - self.attn = MaskedCrossAttention( - dim=dim, - dim_visual=dim_visual, - dim_head=dim_head, - heads=heads, - only_attend_immediate_media=only_attend_immediate_media, - ) - self.attn_gate = nn.Parameter(torch.tensor([0.0])) - - self.ff = FeedForward(dim, mult=ff_mult) - self.ff_gate = nn.Parameter(torch.tensor([0.0])) - - def forward( - self, - x, - media, - media_locations=None, - use_cached_media=False, - ): - x = ( - self.attn( - x, - media, - media_locations=media_locations, - use_cached_media=use_cached_media, - ) - * self.attn_gate.tanh() - + x - ) - x = self.ff(x) * self.ff_gate.tanh() + x - - return x diff --git a/mllm/src/mpt_lora_patch/README.md b/mllm/src/mpt_lora_patch/README.md deleted file mode 100644 index ecc47b5d96b5998be95eee75acd2e95d75384bd6..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# MPT-7B LoRA Patch - -This is the Python model code for MPT-7B patched so that it can be used with a LoRA. Note that while I tested that it works and I get reasonable results out, it is very possible that the model isn't being trained correctly. The model code specifically says that left padding is not supported, but I forcibly did so and got decent results. - -Note that when using LoRA, there is a strange quirk that prevents me from causing generation with an empty prompt. - -I also included a model-agnostic `export_hf_checkpoint.py` script, which you can use to merge your lora back into a new full model. Once you do this, you do not need to use the patched version of the model code anymore. That being said, if you want to be able to load the model in 8bit you will still need it. The usage is `python export_hf_checkpoint.py `. 
- -If you would like to use this with `text-generation-webui`, apply the following patch: - -```patch ---- a/modules/training.py -+++ b/modules/training.py -@@ -28,12 +28,13 @@ try: - MODEL_CLASSES = {v: k for k, v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES} - except: - standard_modules = ["q_proj", "v_proj"] -- model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"]} -+ model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"], "mpt": ["Wqkv"]} - MODEL_CLASSES = { - "LlamaForCausalLM": "llama", - "OPTForCausalLM": "opt", - "GPTJForCausalLM": "gptj", -- "GPTNeoXForCausalLM": "gpt_neox" -+ "GPTNeoXForCausalLM": "gpt_neox", -+ "MPTForCausalLM": "mpt" - } - - WANT_INTERRUPT = False -``` - -You will need to run the webui with these options: - -```bash -python server.py --model mosaicml_mpt-7b-instruct --trust-remote-code --load-in-8bit -``` - -You may also need to patch `bitsandbytes/nn/modules.py` to prevent running out of VRAM when saving the LoRA: - -```patch ---- a/modules.py -+++ b/modules.py -@@ -259,13 +259,13 @@ - if not self.state.has_fp16_weights and self.state.CB is None and self.state.CxB is not None: - # reorder weight layout back from ampere/turing to row - reorder_layout = True -- weight_clone = self.weight.data.clone() -+ weight_clone = self.weight.data - else: - reorder_layout = False - - try: - if reorder_layout: -- self.weight.data = undo_layout(self.state.CxB, self.state.tile_indices) -+ self.weight.data = undo_layout(self.state.CxB.cpu(), self.state.tile_indices.cpu()) - - super()._save_to_state_dict(destination, prefix, keep_vars) -``` - -(It resides in `miniconda3/envs/textgen/lib/python3.10/site-packages/bitsandbytes/nn/modules.py` for me.) - -You can find the source model here: [mosaicml/mpt-7b-instruct](https://huggingface.co/mosaicml/mpt-7b-instruct) - -The alterations are based on the [source code for the llama model](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) from HF Transformers. 
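As a quick end-to-end sanity check of the patched code, a minimal loading sketch follows. It is not part of the patch itself: the local paths are hypothetical, it assumes `peft`/`transformers` versions contemporary with this patch, and it loads in full precision on the default device for simplicity (for 8-bit or webui use, see the notes above).

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical paths: a local copy of this patched model code + weights,
# and a LoRA adapter trained against it.
base_path = "./mosaicml_mpt-7b-instruct"
lora_path = "./my-mpt-lora"

tokenizer = AutoTokenizer.from_pretrained(base_path)
base_model = AutoModelForCausalLM.from_pretrained(
    base_path,
    trust_remote_code=True,  # required: the patched model code ships with the checkpoint
)
model = PeftModel.from_pretrained(base_model, lora_path)  # attach the adapter
model.eval()

prompt = "### Instruction:\nSay hello.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```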
- -## Model License - -CC-By-SA-3.0 diff --git a/mllm/src/mpt_lora_patch/__pycache__/adapt_tokenizer.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/adapt_tokenizer.cpython-310.pyc deleted file mode 100644 index e8f564d93ae4c5e5771b17bfb94ab3e7346fbdd0..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/adapt_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/adapt_tokenizer.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/adapt_tokenizer.cpython-39.pyc deleted file mode 100644 index 9ac7c0e6b96cc96400f9a198bbe72ade92726afe..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/adapt_tokenizer.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/attention.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/attention.cpython-310.pyc deleted file mode 100644 index bfa77edcb3cf0e8d745da4129e80282b34534984..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/attention.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/attention.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/attention.cpython-39.pyc deleted file mode 100644 index 85f169275003f9af0f545f7042f3841e9aad834f..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/attention.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/blocks.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/blocks.cpython-310.pyc deleted file mode 100644 index fea46259dd3f9ce80f29fed6803b84456c1693fb..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/blocks.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/blocks.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/blocks.cpython-39.pyc deleted file mode 100644 index 83a430b1ff944eb0cf6767a14ea6c5b94e088367..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/blocks.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/configuration_mpt.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/configuration_mpt.cpython-310.pyc deleted file mode 100644 index ef27f56c03e051fdfce939e85cc23f67c5f91482..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/configuration_mpt.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/configuration_mpt.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/configuration_mpt.cpython-39.pyc deleted file mode 100644 index d8f7c2e75c064d39ec20b25f3f90f26d32882b77..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/configuration_mpt.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/hf_prefixlm_converter.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/hf_prefixlm_converter.cpython-310.pyc deleted file mode 100644 index 2884b1d8f44d01a0cca1c209f7b6c06987f73802..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/hf_prefixlm_converter.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/hf_prefixlm_converter.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/hf_prefixlm_converter.cpython-39.pyc deleted file mode 100644 index 
7fc80bdc29cb62e9e9d272e8295cc71ad8192041..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/hf_prefixlm_converter.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/meta_init_context.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/meta_init_context.cpython-310.pyc deleted file mode 100644 index 6abbcd95c08bc5afd02c78daaf860839b7b9ef1c..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/meta_init_context.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/meta_init_context.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/meta_init_context.cpython-39.pyc deleted file mode 100644 index 86690d034400d68d90a1dc3d398b84df1760c4dd..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/meta_init_context.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/modeling_mpt.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/modeling_mpt.cpython-310.pyc deleted file mode 100644 index 7b60ee850591576c9663dd3c8615e18266dc61e4..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/modeling_mpt.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/modeling_mpt.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/modeling_mpt.cpython-39.pyc deleted file mode 100644 index f8cd56b2a20c38cf437320e616c14515e4868a69..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/modeling_mpt.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/norm.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/norm.cpython-310.pyc deleted file mode 100644 index e876ab80eba9597b5388a083c1435b54819607e5..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/norm.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/norm.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/norm.cpython-39.pyc deleted file mode 100644 index 9c9f51ef3deaccc12a3e301430ce564ae18a63f3..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/norm.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/param_init_fns.cpython-310.pyc b/mllm/src/mpt_lora_patch/__pycache__/param_init_fns.cpython-310.pyc deleted file mode 100644 index fec030bc79de2c8c3565c77a6a53f25a6eec72e1..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/param_init_fns.cpython-310.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/__pycache__/param_init_fns.cpython-39.pyc b/mllm/src/mpt_lora_patch/__pycache__/param_init_fns.cpython-39.pyc deleted file mode 100644 index f249829e78b4cdea7b81c16b5790624c80efe775..0000000000000000000000000000000000000000 Binary files a/mllm/src/mpt_lora_patch/__pycache__/param_init_fns.cpython-39.pyc and /dev/null differ diff --git a/mllm/src/mpt_lora_patch/adapt_tokenizer.py b/mllm/src/mpt_lora_patch/adapt_tokenizer.py deleted file mode 100644 index e640c157e8f5581953c518df0611a423225ef598..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/adapt_tokenizer.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Union -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] -NUM_SENTINEL_TOKENS: int = 
100 - -def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): - """Adds sentinel tokens and padding token (if missing). - - Expands the tokenizer vocabulary to include sentinel tokens - used in mixture-of-denoiser tasks as well as a padding token. - - All added tokens are added as special tokens. No tokens are - added if sentinel tokens and padding token already exist. - """ - sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)] - tokenizer.add_tokens(sentinels_to_add, special_tokens=True) - if tokenizer.pad_token is None: - tokenizer.add_tokens('<pad>', special_tokens=True) - tokenizer.pad_token = '<pad>' - assert tokenizer.pad_token_id is not None - sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]) - _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids - tokenizer.sentinel_token_ids = _sentinel_token_ids - -class AutoTokenizerForMOD(AutoTokenizer): - """AutoTokenizer + Adaptation for MOD. - - A simple wrapper around AutoTokenizer to make instantiating - an MOD-adapted tokenizer a bit easier. - - MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>), - a padding token, and a property to get the token ids of the - sentinel tokens. - """ - - @classmethod - def from_pretrained(cls, *args, **kwargs): - """See `AutoTokenizer.from_pretrained` docstring.""" - tokenizer = super().from_pretrained(*args, **kwargs) - adapt_tokenizer_for_denoising(tokenizer) - return tokenizer \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/attention.py b/mllm/src/mpt_lora_patch/attention.py deleted file mode 100644 index 2ca1069cd14ca055d918fa623d7da5efb4c5fd89..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/attention.py +++ /dev/null @@ -1,276 +0,0 @@ -"""Attention layers.""" -import math -import warnings -from typing import Optional -import torch -import torch.nn as nn -from einops import rearrange -from torch import nn -from .norm import LPLayerNorm - -def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool): - if original_is_causal and num_query_tokens != num_key_tokens: - if num_query_tokens != 1: - raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.') - else: - return False - return original_is_causal - -def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False): - q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads) - k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads) - v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads) - min_val = torch.finfo(q.dtype).min - (b, _, s_q, d) = q.shape - s_k = k.size(-1) - if softmax_scale is None: - softmax_scale = 1 / math.sqrt(d) - attn_weight = q.matmul(k) * softmax_scale - if attn_bias is not None: - if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q): - raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.') - attn_weight = attn_weight + attn_bias - if key_padding_mask is not None: - if attn_bias is not None: - warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage.
Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.') - attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val) - if is_causal: - s = max(s_q, s_k) - causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16) - causal_mask = causal_mask.tril() - causal_mask = causal_mask.to(torch.bool) - causal_mask = ~causal_mask - causal_mask = causal_mask[-s_q:, -s_k:] - attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val) - attn_weight = torch.softmax(attn_weight, dim=-1) - if dropout_p: - attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True) - out = attn_weight.matmul(v) - out = rearrange(out, 'b h s d -> b s (h d)') - if needs_weights: - return (out, attn_weight) - return (out, None) - -def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]): - for tensor in tensors: - if tensor.dtype not in valid_dtypes: - raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.') - if not tensor.is_cuda: - raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).') - -def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False): - try: - from flash_attn import bert_padding, flash_attn_interface - except: - raise RuntimeError('Please install flash-attn==1.0.3.post0') - check_valid_inputs(query, key, value) - if attn_bias is not None: - raise NotImplementedError(f'attn_bias not implemented for flash attn.') - (batch_size, seqlen) = query.shape[:2] - if key_padding_mask is None: - key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) - query_padding_mask = key_padding_mask[:, -query.size(1):] - (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask) - query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads) - (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask) - key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads) - (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask) - value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads) - if multiquery: - key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1)) - value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1)) - dropout_p = dropout_p if training else 0.0 - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights) - output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen) - return (output, None) - -def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False): - try: - from flash_attn import flash_attn_triton - except: - raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202') - check_valid_inputs(query, key, value) - if dropout_p: - raise 
NotImplementedError(f'Dropout not implemented for attn_impl: triton.') - if needs_weights: - raise NotImplementedError(f'attn_impl: triton cannot return attn weights.') - if key_padding_mask is not None: - warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.') - (b_size, s_k) = key_padding_mask.shape[:2] - if attn_bias is None: - attn_bias = query.new_zeros(b_size, 1, 1, s_k) - attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min) - query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads) - key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads) - value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads) - if multiquery: - key = key.expand(*key.shape[:2], n_heads, key.size(-1)) - value = value.expand(*value.shape[:2], n_heads, value.size(-1)) - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale) - output = attn_output.view(*attn_output.shape[:2], -1) - return (output, None) - -class MultiheadAttention(nn.Module): - """Multi-head self attention. - - Using torch or triton attention implemetation enables user to also use - additive bias. - """ - - def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None): - super().__init__() - self.attn_impl = attn_impl - self.clip_qkv = clip_qkv - self.qk_ln = qk_ln - self.d_model = d_model - self.n_heads = n_heads - self.softmax_scale = softmax_scale - if self.softmax_scale is None: - self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads) - self.attn_dropout_p = attn_pdrop - self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device) - fuse_splits = (d_model, 2 * d_model) - self.Wqkv._fused = (0, fuse_splits) - if self.qk_ln: - layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm - self.q_ln = layernorm_class(self.d_model, device=device) - self.k_ln = layernorm_class(self.d_model, device=device) - if self.attn_impl == 'flash': - self.attn_fn = flash_attn_fn - elif self.attn_impl == 'triton': - self.attn_fn = triton_flash_attn_fn - warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.') - elif self.attn_impl == 'torch': - self.attn_fn = scaled_multihead_dot_product_attention - if torch.cuda.is_available(): - warnings.warn('Using `attn_impl: torch`. 
If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.') - else: - raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.') - self.out_proj = nn.Linear(self.d_model, self.d_model, device=device) - self.out_proj._is_residual = True - - def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False): - qkv = self.Wqkv(x) - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - (query, key, value) = qkv.chunk(3, dim=2) - key_padding_mask = attention_mask - if self.qk_ln: - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - past_key_value = (key, value) - if attn_bias is not None: - attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):] - (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights) - return (self.out_proj(context), attn_weights, past_key_value) - -class MultiQueryAttention(nn.Module): - """Multi-Query self attention. - - Using torch or triton attention implemetation enables user to also use - additive bias. - """ - - def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None): - super().__init__() - self.attn_impl = attn_impl - self.clip_qkv = clip_qkv - self.qk_ln = qk_ln - self.d_model = d_model - self.n_heads = n_heads - self.head_dim = d_model // n_heads - self.softmax_scale = softmax_scale - if self.softmax_scale is None: - self.softmax_scale = 1 / math.sqrt(self.head_dim) - self.attn_dropout_p = attn_pdrop - self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device) - fuse_splits = (d_model, d_model + self.head_dim) - self.Wqkv._fused = (0, fuse_splits) - if self.qk_ln: - layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm - self.q_ln = layernorm_class(d_model, device=device) - self.k_ln = layernorm_class(self.head_dim, device=device) - if self.attn_impl == 'flash': - self.attn_fn = flash_attn_fn - elif self.attn_impl == 'triton': - self.attn_fn = triton_flash_attn_fn - warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.') - elif self.attn_impl == 'torch': - self.attn_fn = scaled_multihead_dot_product_attention - if torch.cuda.is_available(): - warnings.warn('Using `attn_impl: torch`. 
If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.') - else: - raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.') - self.out_proj = nn.Linear(self.d_model, self.d_model, device=device) - self.out_proj._is_residual = True - - def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False): - qkv = self.Wqkv(x) - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2) - key_padding_mask = attention_mask - if self.qk_ln: - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - past_key_value = (key, value) - if attn_bias is not None: - attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):] - (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True) - return (self.out_proj(context), attn_weights, past_key_value) - -def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id): - if attn_impl == 'flash': - return None - elif attn_impl in ['torch', 'triton']: - if alibi: - if (prefix_lm or not causal) or use_sequence_id: - return (1, n_heads, seq_len, seq_len) - return (1, n_heads, 1, seq_len) - elif prefix_lm or use_sequence_id: - return (1, 1, seq_len, seq_len) - return None - else: - raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.') - -def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8): - if attn_impl == 'flash': - return None - elif attn_impl in ['torch', 'triton']: - if alibi: - (device, dtype) = (attn_bias.device, attn_bias.dtype) - attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype)) - return attn_bias - else: - raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.') - -def gen_slopes(n_heads, alibi_bias_max=8, device=None): - _n_heads = 2 ** math.ceil(math.log2(n_heads)) - m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device) - m = m.mul(alibi_bias_max / _n_heads) - slopes = 1.0 / torch.pow(2, m) - if _n_heads != n_heads: - slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads] - return slopes.view(1, n_heads, 1, 1) - -def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None): - alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len) - if full: - alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1) - alibi_bias = alibi_bias.abs().mul(-1) - slopes = gen_slopes(n_heads, alibi_bias_max, device=device) - alibi_bias = alibi_bias * slopes - return alibi_bias.to(dtype=dtype) -ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention} \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/blocks.py b/mllm/src/mpt_lora_patch/blocks.py deleted file mode 100644 index 
04493aa4c03ef1b14ec539c9af8e9c38e8befc8b..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/blocks.py +++ /dev/null @@ -1,41 +0,0 @@ -"""GPT Blocks used for the GPT Model.""" -from typing import Dict, Optional, Tuple -import torch -import torch.nn as nn -from .attention import ATTN_CLASS_REGISTRY -from .norm import NORM_CLASS_REGISTRY - -class MPTMLP(nn.Module): - - def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None): - super().__init__() - self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) - self.act = nn.GELU(approximate='none') - self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) - self.down_proj._is_residual = True - - def forward(self, x): - return self.down_proj(self.act(self.up_proj(x))) - -class MPTBlock(nn.Module): - - def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs): - del kwargs - super().__init__() - norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] - attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']] - self.norm_1 = norm_class(d_model, device=device) - self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, device=device) - self.norm_2 = norm_class(d_model, device=device) - self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) - self.resid_attn_dropout = nn.Dropout(resid_pdrop) - self.resid_ffn_dropout = nn.Dropout(resid_pdrop) - - def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: - a = self.norm_1(x) - (b, _, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal) - x = x + self.resid_attn_dropout(b) - m = self.norm_2(x) - n = self.ffn(m) - x = x + self.resid_ffn_dropout(n) - return (x, past_key_value) \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/configuration_mpt.py b/mllm/src/mpt_lora_patch/configuration_mpt.py deleted file mode 100644 index 35d1269cd4b599799d6df7953a8d0c30b33d1e65..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/configuration_mpt.py +++ /dev/null @@ -1,118 +0,0 @@ -"""A HuggingFace-style model configuration.""" -from typing import Dict, Optional, Union -from transformers import PretrainedConfig -attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8} -init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu'} - -class MPTConfig(PretrainedConfig): - model_type = 'mpt' - - def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, 
resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs): - """The MPT configuration class. - - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the MLP. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. - alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. - norm_type (str): choose type of norm to use - multiquery_attention (bool): Whether to use multiquery attention implementation. - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. 
- init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - """ - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.expansion_ratio = expansion_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.learned_pos_emb = learned_pos_emb - self.attn_config = attn_config - self.init_device = init_device - self.logit_scale = logit_scale - self.no_bias = no_bias - self.verbose = verbose - self.embedding_fraction = embedding_fraction - self.norm_type = norm_type - self.use_cache = use_cache - self.init_config = init_config - if 'name' in kwargs: - del kwargs['name'] - if 'loss_fn' in kwargs: - del kwargs['loss_fn'] - super().__init__(**kwargs) - self._validate_config() - - def _set_config_defaults(self, config, config_defaults): - for (k, v) in config_defaults.items(): - if k not in config: - config[k] = v - return config - - def _validate_config(self): - self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults) - self.init_config = self._set_config_defaults(self.init_config, init_config_defaults) - if self.d_model % self.n_heads != 0: - raise ValueError('d_model must be divisible by n_heads') - if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])): - raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1") - if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: - raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}") - if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError('prefix_lm only implemented with torch and triton attention.') - if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError('alibi only implemented with torch and triton attention.') - if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.') - if self.embedding_fraction > 1 or self.embedding_fraction <= 0: - raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!') - if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': - raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 
'inv_sqrt_d_model'.") - if self.init_config.get('name', None) is None: - raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.") - if not self.learned_pos_emb and (not self.attn_config['alibi']): - raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.') \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/export_hf_checkpoint.py b/mllm/src/mpt_lora_patch/export_hf_checkpoint.py deleted file mode 100644 index 779cffde87e1969d4ff8e3488b5db5178fc98d51..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/export_hf_checkpoint.py +++ /dev/null @@ -1,45 +0,0 @@ -import sys -import torch -from peft import PeftModel -from transformers import AutoModelForCausalLM - -# Based on https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py -# Note that this does NOT guard against no-op merges. I would suggest testing the output. - -if len(sys.argv) != 4: - print("Usage: python export_hf_checkpoint.py <source_path> <lora_path> <dest_path>") - exit(1) - -source_path = sys.argv[1] -lora_path = sys.argv[2] -dest_path = sys.argv[3] - -base_model = AutoModelForCausalLM.from_pretrained( - source_path, - load_in_8bit=False, - torch_dtype=torch.float16, - device_map={"": "cpu"}, - trust_remote_code=True, -) - -lora_model = PeftModel.from_pretrained( - base_model, - lora_path, - device_map={"": "cpu"}, - torch_dtype=torch.float16, -) - -# merge weights - new merging method from peft -lora_model = lora_model.merge_and_unload() -lora_model.train(False) - -lora_model_sd = lora_model.state_dict() -deloreanized_sd = { - k.replace("base_model.model.", ""): v - for k, v in lora_model_sd.items() - if "lora" not in k -} - -base_model.save_pretrained( - dest_path, state_dict=deloreanized_sd, max_shard_size="400MB" -) diff --git a/mllm/src/mpt_lora_patch/hf_prefixlm_converter.py b/mllm/src/mpt_lora_patch/hf_prefixlm_converter.py deleted file mode 100644 index 8c1a6487202a6400a7116a6bd68b493892ef0d14..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/hf_prefixlm_converter.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Converts Huggingface Causal LM to Prefix LM. - -Conversion does lightweight surgery on a HuggingFace -Causal LM to convert it to a Prefix LM. - -Prefix LMs accept a `bidirectional_mask` input in `forward` -and treat the input prompt as the prefix in `generate`.
-""" -import math -import warnings -from types import MethodType -from typing import Any, Dict, List, Optional, Tuple, Union -import torch -from transformers.models.bloom.modeling_bloom import BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel, CausalLMOutputWithCrossAttentions, CrossEntropyLoss -from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom -from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom -from transformers.models.bloom.modeling_bloom import logging -from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel -from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM -from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM -from transformers.models.gptj.modeling_gptj import GPTJForCausalLM -from transformers.models.opt.modeling_opt import OPTForCausalLM -from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt -from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt -logger = logging.get_logger(__name__) -_SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM) -CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM] - -def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES: - """Converts a GPT-style Causal LM to a Prefix LM. - - Supported HuggingFace model classes: - - `GPT2LMHeadModel` - - `GPTNeoForCausalLM` - - `GPTNeoXForCausalLM` - - `GPTJForCausalLM` - - See `convert_hf_causal_lm_to_prefix_lm` for more details. - """ - if hasattr(model, '_prefix_lm_converted'): - return model - assert isinstance(model, _SUPPORTED_GPT_MODELS) - assert model.config.add_cross_attention == False, 'Only supports GPT-style decoder-only models' - - def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]: - """Helper that gets a list of the model's attention modules. - - Each module has a `bias` buffer used for causal masking. The Prefix LM - conversion adds logic to dynamically manipulate these biases to support - Prefix LM attention masking. 
- """ - attn_modules = [] - if isinstance(model, GPTNeoXForCausalLM): - blocks = model.gpt_neox.layers - else: - blocks = model.transformer.h - for block in blocks: - if isinstance(model, GPTNeoForCausalLM): - if block.attn.attention_type != 'global': - continue - attn_module = block.attn.attention - elif isinstance(model, GPTNeoXForCausalLM): - attn_module = block.attention - else: - attn_module = block.attn - attn_modules.append(attn_module) - return attn_modules - setattr(model, '_original_forward', getattr(model, 'forward')) - setattr(model, '_original_generate', getattr(model, 'generate')) - - def forward(self: CAUSAL_GPT_TYPES, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]]=None, attention_mask: Optional[torch.FloatTensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, token_type_ids: Optional[torch.LongTensor]=None, position_ids: Optional[torch.LongTensor]=None, head_mask: Optional[torch.FloatTensor]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None): - """Wraps original forward to enable PrefixLM attention.""" - - def call_og_forward(): - if isinstance(self, GPTNeoXForCausalLM): - return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) - else: - return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) - if bidirectional_mask is None: - return call_og_forward() - assert isinstance(bidirectional_mask, torch.Tensor) - attn_modules = _get_attn_modules(model) - (b, s) = bidirectional_mask.shape - max_length = attn_modules[0].bias.shape[-1] - if s > max_length: - raise ValueError(f'bidirectional_mask sequence length (={s}) exceeds the ' + f'max length allowed by the model ({max_length}).') - assert s <= max_length - if s < max_length: - pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device) - bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1) - bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1) - for attn_module in attn_modules: - attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional) - output = call_og_forward() - for attn_module in attn_modules: - attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None] - return output - - def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]): - """Wraps original generate to enable PrefixLM attention.""" - attn_modules = _get_attn_modules(model) - for attn_module in attn_modules: - attn_module.bias.data[:] = 1 - output = self._original_generate(*args, **kwargs) - for attn_module in attn_modules: - attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None] - return output - setattr(model, 'forward', MethodType(forward, model)) - setattr(model, 'generate', MethodType(generate, model)) - 
setattr(model, '_prefix_lm_converted', True) - return model - -def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM: - """Converts a BLOOM Causal LM to a Prefix LM. - - Supported HuggingFace model classes: - - `BloomForCausalLM` - - See `convert_hf_causal_lm_to_prefix_lm` for more details. - """ - if hasattr(model, '_prefix_lm_converted'): - return model - assert isinstance(model, BloomForCausalLM) - assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models' - - def _prepare_attn_mask(self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor: - combined_attention_mask = None - device = attention_mask.device - (_, src_length) = input_shape - if src_length > 1: - combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length) - if bidirectional_mask is not None: - assert attention_mask.shape == bidirectional_mask.shape - expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length) - combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask) - expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length) - combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask - return combined_attention_mask - - def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - num_heads = self.config.n_head - closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) - base = torch.tensor(2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3))), device=device, dtype=torch.float32) - powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32) - slopes = torch.pow(base, powers) - if closest_power_of_2 != num_heads: - extra_base = torch.tensor(2 ** (-2 ** (-(math.log2(2 * closest_power_of_2) - 3))), device=device, dtype=torch.float32) - num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) - extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32) - slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) - qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1) - ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1) - diffs = qa - ka + key_length - query_length - diffs = -diffs.abs() - alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length) - alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length) - return alibi.to(dtype) - KeyValueT = Tuple[torch.Tensor, torch.Tensor] - - def forward(self: BloomModel, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.LongTensor]=None, inputs_embeds: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - if deprecated_arguments.pop('position_ids', False) is not False: - warnings.warn('`position_ids` have 
no functionality in BLOOM and will be removed in v5.0.0. ' + 'You can safely ignore passing `position_ids`.', FutureWarning) - if len(deprecated_arguments) > 0: - raise ValueError(f'Got unexpected arguments: {deprecated_arguments}') - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time') - elif input_ids is not None: - (batch_size, seq_length) = input_ids.shape - elif inputs_embeds is not None: - (batch_size, seq_length, _) = inputs_embeds.shape - else: - raise ValueError('You have to specify either input_ids or inputs_embeds') - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - hidden_states = self.word_embeddings_layernorm(inputs_embeds) - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: - tmp = past_key_values[0][0] - past_key_values_length = tmp.shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) - else: - attention_mask = attention_mask.to(hidden_states.device) - alibi = self._build_alibi_tensor(batch_size=batch_size, query_length=seq_length, key_length=seq_length_with_past, dtype=hidden_states.dtype, device=hidden_states.device) - causal_mask = self._prepare_attn_mask(attention_mask, bidirectional_mask, input_shape=(batch_size, seq_length), past_key_values_length=past_key_values_length) - for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - hst = (hidden_states,) - all_hidden_states = all_hidden_states + hst - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning('`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`...') - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, use_cache=use_cache, output_attentions=output_attentions) - return custom_forward - outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i]) - else: - outputs = block(hidden_states, layer_past=layer_past, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi) - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - if output_attentions: - oa = (outputs[2 if use_cache else 1],) - all_self_attentions = all_self_attentions + oa - hidden_states = self.ln_f(hidden_states) - if output_hidden_states: - hst = (hidden_states,) - all_hidden_states = all_hidden_states + hst - if not return_dict: - return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)) - return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions) - setattr(model.transformer, '_prepare_attn_mask', MethodType(_prepare_attn_mask, model.transformer)) - setattr(model.transformer, '_build_alibi_tensor', MethodType(_build_alibi_tensor, model.transformer)) - setattr(model.transformer, 'forward', MethodType(forward, model.transformer)) - KeyValueT = Tuple[torch.Tensor, torch.Tensor] - - def forward(self: BloomForCausalLM, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.Tensor]=None, inputs_embeds: Optional[torch.Tensor]=None, labels: Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - """Replacement forward method for BloomCausalLM.""" - if deprecated_arguments.pop('position_ids', False) is not False: - warnings.warn('`position_ids` have no functionality in BLOOM and will be removed ' + 'in v5.0.0. 
You can safely ignore passing `position_ids`.', FutureWarning)
-        if len(deprecated_arguments) > 0:
-            raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        transformer_outputs = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask, bidirectional_mask=bidirectional_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
-        hidden_states = transformer_outputs[0]
-        lm_logits = self.lm_head(hidden_states)
-        loss = None
-        if labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            (batch_size, seq_length, vocab_size) = shift_logits.shape
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return (loss,) + output if loss is not None else output
-        return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions)
-
-    def prepare_inputs_for_generation(self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, **kwargs) -> dict:
-        if past:
-            input_ids = input_ids[:, -1].unsqueeze(-1)
-            bidirectional_mask = None
-            if past[0][0].shape[0] == input_ids.shape[0]:
-                past = self._convert_to_bloom_cache(past)
-        else:
-            bidirectional_mask = torch.ones_like(input_ids)
-        return {'input_ids': input_ids, 'past_key_values': past, 'use_cache': True, 'attention_mask': attention_mask, 'bidirectional_mask': bidirectional_mask}
-    setattr(model, 'forward', MethodType(forward, model))
-    setattr(model, 'prepare_inputs_for_generation', MethodType(prepare_inputs_for_generation, model))
-    setattr(model, '_prefix_lm_converted', True)
-    return model
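# Editorial sketch (not part of the original file): the head-slope schedule that
# `_build_alibi_tensor` above computes, shown standalone for a power-of-two head
# count. Only `torch` and `math` are assumed.
import math
import torch

num_heads = 8
closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
base = 2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3)))
slopes = torch.pow(torch.tensor(base), torch.arange(1, 1 + closest_power_of_2, dtype=torch.float32))
# For 8 heads: tensor([0.5000, 0.2500, 0.1250, 0.0625, 0.0312, 0.0156, 0.0078, 0.0039]),
# i.e. 2^-1 ... 2^-8. Each head then adds slope * -|query_pos - key_pos| to its attention
# logits; the -abs() makes the bias symmetric, so it also serves the bidirectional prefix.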
- """ - if hasattr(model, '_prefix_lm_converted'): - return model - assert isinstance(model, OPTForCausalLM) - assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models' - setattr(model, '_original_forward', getattr(model, 'forward')) - setattr(model, '_original_generate', getattr(model, 'generate')) - model.model.decoder.bidirectional_mask = None - - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - combined_attention_mask = None - if input_shape[-1] > 1: - if self.bidirectional_mask == 'g': - (bsz, src_length) = input_shape - combined_attention_mask = torch.zeros((bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device) - else: - combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(inputs_embeds.device) - if self.bidirectional_mask is not None: - assert attention_mask.shape == self.bidirectional_mask.shape - expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device) - combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask) - if attention_mask is not None: - expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device) - combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - return combined_attention_mask - setattr(model.model.decoder, '_prepare_decoder_attention_mask', MethodType(_prepare_decoder_attention_mask, model.model.decoder)) - - def forward(self: OPTForCausalLM, input_ids: Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.ByteTensor]=None, head_mask: Optional[torch.Tensor]=None, past_key_values: Optional[List[torch.FloatTensor]]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None): - - def call_og_forward(): - return self._original_forward(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) - if bidirectional_mask is None: - return call_og_forward() - self.model.decoder.bidirectional_mask = bidirectional_mask - try: - outputs = call_og_forward() - except: - self.model.decoder.bidirectional_mask = None - raise - self.model.decoder.bidirectional_mask = None - return outputs - - def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]): - """Wraps original generate to enable PrefixLM-style attention.""" - self.model.decoder.bidirectional_mask = 'g' - try: - output = self._original_generate(*args, **kwargs) - except: - self.model.decoder.bidirectional_mask = None - raise - self.model.decoder.bidirectional_mask = None - return output - setattr(model, 'forward', MethodType(forward, model)) - setattr(model, 'generate', MethodType(generate, model)) - setattr(model, '_prefix_lm_converted', True) - return model -_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM) -CAUSAL_LM_TYPES = 
-
-def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
-    """Converts a HuggingFace Causal LM to a Prefix LM.
-
-    Supported HuggingFace model classes:
-        - `GPT2LMHeadModel`
-        - `GPTNeoForCausalLM`
-        - `GPTNeoXForCausalLM`
-        - `GPTJForCausalLM`
-        - `BloomForCausalLM`
-        - `OPTForCausalLM`
-
-    Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
-    `generate` method and/or select underlying methods depending on the model class.
-
-    These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".
-
-    Notes on training:
-    To actually train the converted model as a Prefix LM, training batches will need to indicate
-    the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.
-
-    **This is not a standard input and requires custom layers either within or after your dataloader.**
-
-    In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
-    such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
-    That is, the prefix portion of the sequence should not generate any loss. Loss should only be
-    generated by the target portion of the sequence.
-
-    Notes on `GPTNeoForCausalLM`:
-    To simplify the implementation, "global" and "local" attention layers are handled differently.
-    For "global" layers, we handle conversion as described above. For "local" layers, which use a
-    causal attention mask within a restricted local window, we do not alter the masking.
-
-    Notes on `forward` method conversion:
-    After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
-    which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
-    belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
-    0 indicates token positions belonging to the target.
-
-    The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
-    causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
-    the causal masks before returning the result.
-
-    Notes on `generate` method conversion:
-    After conversion, the `generate` method will have the same signature but will internally
-    convert all causal masks to be purely bidirectional, call the original `generate` method, and
-    (where appropriate) reset the causal masks before returning the result.
-
-    This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
-    "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
-    each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
-    another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
-    previously-generated tokens (also as expected in a Prefix LM).
-
-    To preserve the API, the original methods are renamed to `_original_forward` and
-    `_original_generate`, and replaced with new `forward` and `generate` methods that wrap
-    them, respectively, although implementation details vary by model class.
- """ - if isinstance(model, _SUPPORTED_GPT_MODELS): - return _convert_gpt_causal_lm_to_prefix_lm(model) - elif isinstance(model, BloomForCausalLM): - return _convert_bloom_causal_lm_to_prefix_lm(model) - elif isinstance(model, OPTForCausalLM): - return _convert_opt_causal_lm_to_prefix_lm(model) - else: - raise TypeError(f'Cannot convert model to Prefix LM. ' + f'Model does not belong to set of supported HF models:' + f'\n{_SUPPORTED_HF_MODELS}') - -def add_bidirectional_mask_if_missing(batch: Dict[str, Any]): - """Attempts to add bidirectional_mask to batch if missing. - - Raises: - KeyError if bidirectional_mask is missing and can't be inferred - """ - if 'bidirectional_mask' not in batch: - if batch.get('mode', None) == 'icl_task': - batch['bidirectional_mask'] = batch['attention_mask'].clone() - for (i, continuation_indices) in enumerate(batch['continuation_indices']): - batch['bidirectional_mask'][i, continuation_indices] = 0 - elif 'labels' in batch and 'attention_mask' in batch: - batch['bidirectional_mask'] = torch.logical_and(torch.eq(batch['attention_mask'], 1), torch.eq(batch['labels'], -100)).type_as(batch['attention_mask']) - else: - raise KeyError('No bidirectional_mask in batch and not sure how to construct one.') \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/meta_init_context.py b/mllm/src/mpt_lora_patch/meta_init_context.py deleted file mode 100644 index 6cba6fff0fe21fe222c7ab38eae44a9784c0be9c..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/meta_init_context.py +++ /dev/null @@ -1,94 +0,0 @@ -from contextlib import contextmanager -import torch -import torch.nn as nn - -@contextmanager -def init_empty_weights(include_buffers: bool=False): - """Meta initialization context manager. - - A context manager under which models are initialized with all parameters - on the meta device, therefore creating an empty model. Useful when just - initializing the model would blow the available RAM. - - Args: - include_buffers (`bool`, *optional*, defaults to `False`): Whether or - not to also put all buffers on the meta device while initializing. - - Example: - ```python - import torch.nn as nn - - # Initialize a model with 100 billions parameters in no time and without using any RAM. - with init_empty_weights(): - tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) - ``` - - - - Any model created under this context manager has no weights. As such you can't do something like - `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. - - - """ - with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f: - yield f - -@contextmanager -def init_on_device(device: torch.device, include_buffers: bool=False): - """Device initialization context manager. - - A context manager under which models are initialized with all parameters - on the specified device. - - Args: - device (`torch.device`): Device to initialize all parameters on. - include_buffers (`bool`, *optional*, defaults to `False`): Whether or - not to also put all buffers on the meta device while initializing. 
diff --git a/mllm/src/mpt_lora_patch/meta_init_context.py b/mllm/src/mpt_lora_patch/meta_init_context.py
deleted file mode 100644
index 6cba6fff0fe21fe222c7ab38eae44a9784c0be9c..0000000000000000000000000000000000000000
--- a/mllm/src/mpt_lora_patch/meta_init_context.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from contextlib import contextmanager
-import torch
-import torch.nn as nn
-
-@contextmanager
-def init_empty_weights(include_buffers: bool=False):
-    """Meta initialization context manager.
-
-    A context manager under which models are initialized with all parameters
-    on the meta device, therefore creating an empty model. Useful when just
-    initializing the model would blow the available RAM.
-
-    Args:
-        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
-            not to also put all buffers on the meta device while initializing.
-
-    Example:
-    ```python
-    import torch.nn as nn
-
-    # Initialize a model with 100 billion parameters in no time and without using any RAM.
-    with init_empty_weights():
-        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
-    ```
-
-    Any model created under this context manager has no weights. As such you can't do something like
-    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
-    """
-    with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f:
-        yield f
-
-@contextmanager
-def init_on_device(device: torch.device, include_buffers: bool=False):
-    """Device initialization context manager.
-
-    A context manager under which models are initialized with all parameters
-    on the specified device.
-
-    Args:
-        device (`torch.device`): Device to initialize all parameters on.
-        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
-            not to also put all buffers on the meta device while initializing.
-
-    Example:
-    ```python
-    import torch.nn as nn
-
-    with init_on_device(device=torch.device("cuda")):
-        tst = nn.Linear(100, 100)  # on `cuda` device
-    ```
-    """
-    old_register_parameter = nn.Module.register_parameter
-    if include_buffers:
-        old_register_buffer = nn.Module.register_buffer
-
-    def register_empty_parameter(module, name, param):
-        old_register_parameter(module, name, param)
-        if param is not None:
-            param_cls = type(module._parameters[name])
-            kwargs = module._parameters[name].__dict__
-            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
-
-    def register_empty_buffer(module, name, buffer):
-        old_register_buffer(module, name, buffer)
-        if buffer is not None:
-            module._buffers[name] = module._buffers[name].to(device)
-    if include_buffers:
-        tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
-    else:
-        tensor_constructors_to_patch = {}
-
-    def patch_tensor_constructor(fn):
-
-        def wrapper(*args, **kwargs):
-            kwargs['device'] = device
-            return fn(*args, **kwargs)
-        return wrapper
-    try:
-        nn.Module.register_parameter = register_empty_parameter
-        if include_buffers:
-            nn.Module.register_buffer = register_empty_buffer
-        for torch_function_name in tensor_constructors_to_patch.keys():
-            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
-        yield
-    finally:
-        nn.Module.register_parameter = old_register_parameter
-        if include_buffers:
-            nn.Module.register_buffer = old_register_buffer
-        for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
-            setattr(torch, torch_function_name, old_torch_function)
\ No newline at end of file
diff --git a/mllm/src/mpt_lora_patch/modeling_mpt.py b/mllm/src/mpt_lora_patch/modeling_mpt.py
deleted file mode 100644
index be7a4dbf1f4cf041c44dc0abed24e589685485f9..0000000000000000000000000000000000000000
--- a/mllm/src/mpt_lora_patch/modeling_mpt.py
+++ /dev/null
@@ -1,346 +0,0 @@
-"""A simple, flexible implementation of a GPT model.
- -Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py -""" -import math -import warnings -from typing import List, Optional, Tuple, Union -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from .attention import attn_bias_shape, build_attn_bias -from .blocks import MPTBlock -from .norm import NORM_CLASS_REGISTRY -from .configuration_mpt import MPTConfig -from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising -from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm -from .meta_init_context import init_empty_weights -from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_ -Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] - -class MPTPreTrainedModel(PreTrainedModel): - config_class = MPTConfig - base_model_prefix = 'model' - _no_split_modules = ["MPTBlock"] - supports_gradient_checkpointing = True - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, MPTModel): - module.gradient_checkpointing = value - -class MPTModel(MPTPreTrainedModel): - - def __init__(self, config: MPTConfig): - config._validate_config() - super().__init__(config) - self.gradient_checkpointing = False - self.attn_impl = config.attn_config['attn_impl'] - self.prefix_lm = config.attn_config['prefix_lm'] - self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id'] - self.alibi = config.attn_config['alibi'] - self.alibi_bias_max = config.attn_config['alibi_bias_max'] - if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys(): - norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys()) - raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).') - norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()] - self.embedding_fraction = config.embedding_fraction - self.wte = nn.Embedding(config.vocab_size, config.d_model, device=config.init_device) - if not self.alibi: - self.wpe = nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device) - self.emb_drop = nn.Dropout(config.emb_pdrop) - self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)]) - self.norm_f = norm_class(config.d_model, device=config.init_device) - if config.init_device != 'meta': - self.apply(self.param_init_fn) - self.is_causal = not self.prefix_lm - self._attn_bias_initialized = False - self.attn_bias = None - self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id) - if config.no_bias: - for module in self.modules(): - if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter): - if config.verbose: - warnings.warn(f'Removing bias ({module.bias}) from {module}.') - module.register_parameter('bias', None) - if config.verbose and config.verbose > 2: - print(self) - if 'verbose' not in self.config.init_config: - self.config.init_config['verbose'] = self.config.verbose - if self.config.init_config['verbose'] > 1: - init_fn_name = self.config.init_config['name'] - warnings.warn(f'Using {init_fn_name} initialization.') - - def get_input_embeddings(self): - return self.wte - - def 
set_input_embeddings(self, value): - self.wte = value - - @torch.no_grad() - def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None): - if not self._attn_bias_initialized: - if self.attn_bias_shape: - self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype) - self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max) - self._attn_bias_initialized = True - if self.attn_impl == 'flash': - return (self.attn_bias, attention_mask) - if self.attn_bias is not None: - self.attn_bias = self.attn_bias.to(dtype=dtype, device=device) - attn_bias = self.attn_bias - if self.prefix_lm: - assert isinstance(attn_bias, torch.Tensor) - assert isinstance(prefix_mask, torch.Tensor) - attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask) - if self.attn_uses_sequence_id and sequence_id is not None: - assert isinstance(attn_bias, torch.Tensor) - attn_bias = self._apply_sequence_id(attn_bias, sequence_id) - if attention_mask is not None: - s_k = attention_mask.shape[-1] - if attn_bias is None: - attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype) - else: - attn_bias = attn_bias[:, :, :, -s_k:] - if prefix_mask is not None and attention_mask.shape != prefix_mask.shape: - raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.') - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val) - return (attn_bias, None) - - def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor): - (s_k, s_q) = attn_bias.shape[-2:] - if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len: - raise ValueError('attn_bias does not match the expected shape. 
' + f'The last two dimensions should both be {self.config.max_seq_len} ' + f'but are {s_k} and {s_q}.')
-        seq_len = prefix_mask.shape[-1]
-        if seq_len > self.config.max_seq_len:
-            raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
-        attn_bias = attn_bias[..., :seq_len, :seq_len]
-        causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
-        prefix = prefix_mask.view(-1, 1, 1, seq_len)
-        cannot_attend = ~torch.logical_or(causal, prefix.bool())
-        min_val = torch.finfo(attn_bias.dtype).min
-        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
-        return attn_bias
-
-    def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
-        seq_len = sequence_id.shape[-1]
-        if seq_len > self.config.max_seq_len:
-            raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
-        attn_bias = attn_bias[..., :seq_len, :seq_len]
-        cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
-        min_val = torch.finfo(attn_bias.dtype).min
-        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
-        return attn_bias
-
-    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor] = None):
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                use_cache = False
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
-
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
-
-        if attention_mask is not None:
-            attention_mask = attention_mask.bool()
-        else:
-            attention_mask = torch.ones(
-                (batch_size, seq_length_with_past), dtype=torch.bool,
-                device=input_ids.device if input_ids is not None else inputs_embeds.device
-            )
-
-        if inputs_embeds is None:
-            tok_emb = self.wte(input_ids)
-        else:
-            tok_emb = inputs_embeds
-
-        if prefix_mask is not None:
-            prefix_mask = prefix_mask.bool()
-        if not return_dict:
-            raise NotImplementedError('return_dict False is not implemented yet for MPT')
-        if output_attentions:
-            raise NotImplementedError('output_attentions is not implemented yet for MPT')
-        #if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
-        #    raise NotImplementedError('MPT does not support training with left padding.')
-        if self.prefix_lm and prefix_mask is None:
-            raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
-        if self.training:
-            if 
self.attn_uses_sequence_id and sequence_id is None: - raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.') - elif self.attn_uses_sequence_id is False and sequence_id is not None: - warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.') - S = seq_length - assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}' - if self.alibi: - x = tok_emb - else: - past_position = 0 - if past_key_values is not None: - if len(past_key_values) != self.config.n_layers: - raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).') - past_position = past_key_values[0][0].size(1) - if S + past_position > self.config.max_seq_len: - raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.') - pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0) - if attention_mask is not None and not self.training: - pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0) - pos_emb = self.wpe(pos) - x = tok_emb + pos_emb - if self.embedding_fraction == 1: - x = self.emb_drop(x) - else: - x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction) - assert isinstance(self.emb_drop, nn.Module) - x = self.emb_drop(x_shrunk) - (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id) - if use_cache and past_key_values is None: - past_key_values = [() for _ in range(self.config.n_layers)] - - all_hidden_states = () if output_hidden_states else None - for (b_idx, block) in enumerate(self.blocks): - if output_hidden_states: - assert all_hidden_states is not None - all_hidden_states = all_hidden_states + (x,) - past_key_value = past_key_values[b_idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs) - - return custom_forward - - (x, past_key_value) = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - x, - past_key_value, - attn_bias, - attention_mask, - self.is_causal, - ) - else: - (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal) - - if past_key_values is not None: - past_key_values[b_idx] = past_key_value - x = self.norm_f(x) - return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states) - - def param_init_fn(self, module): - init_fn_name = self.config.init_config['name'] - MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config) - - def fsdp_wrap_fn(self, module): - return isinstance(module, MPTBlock) - - def activation_checkpointing_fn(self, module): - return isinstance(module, 
MPTBlock) - -class MPTForCausalLM(MPTPreTrainedModel): - - def __init__(self, config: MPTConfig): - super().__init__(config) - if not config.tie_word_embeddings: - raise ValueError('MPTForCausalLM only supports tied word embeddings') - self.transformer = MPTModel(config) - self.logit_scale = None - if config.logit_scale is not None: - logit_scale = config.logit_scale - if isinstance(logit_scale, str): - if logit_scale == 'inv_sqrt_d_model': - logit_scale = 1 / math.sqrt(config.d_model) - else: - raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") - self.logit_scale = logit_scale - - def get_input_embeddings(self): - return self.transformer.wte - - def set_input_embeddings(self, value): - self.transformer.wte = value - - def get_output_embeddings(self): - return self.transformer.wte - - def set_output_embeddings(self, new_embeddings): - self.transformer.wte = new_embeddings - - def set_decoder(self, decoder): - self.transformer = decoder - - def get_decoder(self): - return self.transformer - - def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor] = None): - return_dict = return_dict if return_dict is not None else self.config.return_dict - use_cache = use_cache if use_cache is not None else self.config.use_cache - outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds) - logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight) - if self.logit_scale is not None: - if self.logit_scale == 0: - warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. 
This will produce uniform (uninformative) outputs.') - logits *= self.logit_scale - loss = None - if labels is not None: - labels = torch.roll(labels, shifts=-1) - labels[:, -1] = -100 - loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)) - return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states) - - def param_init_fn(self, module): - init_fn_name = self.config.init_config['name'] - MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config) - - def fsdp_wrap_fn(self, module): - return isinstance(module, MPTBlock) - - def activation_checkpointing_fn(self, module): - return isinstance(module, MPTBlock) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - if inputs_embeds is not None: - raise NotImplementedError('inputs_embeds is not implemented for MPT yet') - attention_mask = kwargs['attention_mask'].bool() - if attention_mask[:, -1].sum() != attention_mask.shape[0]: - raise NotImplementedError('MPT does not support generation with right padding.') - if self.transformer.attn_uses_sequence_id and self.training: - sequence_id = torch.zeros_like(input_ids[:1]) - else: - sequence_id = None - if past_key_values is not None: - input_ids = input_ids[:, -1].unsqueeze(-1) - if self.transformer.prefix_lm: - prefix_mask = torch.ones_like(attention_mask) - if kwargs.get('use_cache') == False: - raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.') - else: - prefix_mask = None - return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "media_locations": kwargs.get('media_locations')} - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Used by HuggingFace generate when using beam search with kv-caching. - - See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133 - for an example in transformers. 
- """ - reordered_past = [] - for layer_past in past_key_values: - reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))] - return reordered_past \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/norm.py b/mllm/src/mpt_lora_patch/norm.py deleted file mode 100644 index bec4a4ca3304c2188312387743a49b75015542be..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/norm.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch - -def _cast_if_autocast_enabled(tensor): - if torch.is_autocast_enabled(): - if tensor.device.type == 'cuda': - dtype = torch.get_autocast_gpu_dtype() - elif tensor.device.type == 'cpu': - dtype = torch.get_autocast_cpu_dtype() - else: - raise NotImplementedError() - return tensor.to(dtype=dtype) - return tensor - -class LPLayerNorm(torch.nn.LayerNorm): - - def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): - super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype) - - def forward(self, x): - module_device = x.device - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight - downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias - with torch.autocast(enabled=False, device_type=module_device.type): - return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps) - -def rms_norm(x, weight=None, eps=1e-05): - output = x / torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) - if weight is not None: - return output * weight - return output - -class RMSNorm(torch.nn.Module): - - def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): - super().__init__() - self.eps = eps - if weight: - self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) - else: - self.register_parameter('weight', None) - - def forward(self, x): - return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) - -class LPRMSNorm(RMSNorm): - - def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): - super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device) - - def forward(self, x): - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight - with torch.autocast(enabled=False, device_type=x.device.type): - return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) -NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm} \ No newline at end of file diff --git a/mllm/src/mpt_lora_patch/param_init_fns.py b/mllm/src/mpt_lora_patch/param_init_fns.py deleted file mode 100644 index 418b83ca2363288046f4b48b1d706c5607341fb5..0000000000000000000000000000000000000000 --- a/mllm/src/mpt_lora_patch/param_init_fns.py +++ /dev/null @@ -1,181 +0,0 @@ -import math -import warnings -from collections.abc import Sequence -from functools import partial -from typing import Optional, Tuple, Union -import torch -from torch import nn -from .norm import NORM_CLASS_REGISTRY - -def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs): - del kwargs - if verbose > 1: - warnings.warn(f"Initializing network using module's 
diff --git a/mllm/src/mpt_lora_patch/param_init_fns.py b/mllm/src/mpt_lora_patch/param_init_fns.py
deleted file mode 100644
index 418b83ca2363288046f4b48b1d706c5607341fb5..0000000000000000000000000000000000000000
--- a/mllm/src/mpt_lora_patch/param_init_fns.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import math
-import warnings
-from collections.abc import Sequence
-from functools import partial
-from typing import Optional, Tuple, Union
-import torch
-from torch import nn
-from .norm import NORM_CLASS_REGISTRY
-
-def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
-    del kwargs
-    if verbose > 1:
-        warnings.warn(f"Initializing network using module's reset_parameters attribute")
-    if hasattr(module, 'reset_parameters'):
-        module.reset_parameters()
-
-def fused_init_helper_(module: nn.Module, init_fn_):
-    _fused = getattr(module, '_fused', None)
-    if _fused is None:
-        raise RuntimeError('Internal logic error: module is missing its `_fused` attribute')
-    (dim, splits) = _fused
-    splits = (0, *splits, module.weight.size(dim))
-    for (s, e) in zip(splits[:-1], splits[1:]):
-        slice_indices = [slice(None)] * module.weight.ndim
-        slice_indices[dim] = slice(s, e)
-        init_fn_(module.weight[slice_indices])
-
-def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
-    del kwargs
-    if verbose > 1:
-        warnings.warn(f'If model has bias parameters they are initialized to 0.')
-    if init_div_is_residual is False:
-        div_is_residual = 1.0
-    elif init_div_is_residual is True:
-        div_is_residual = math.sqrt(2 * n_layers)
-    elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int):
-        div_is_residual = init_div_is_residual
-    elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
-        div_is_residual = float(init_div_is_residual)
-    else:
-        raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
-    if init_div_is_residual is not False:
-        if verbose > 1:
-            warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
-    if isinstance(module, nn.Linear):
-        if hasattr(module, '_fused'):
-            fused_init_helper_(module, init_fn_)
-        else:
-            init_fn_(module.weight)
-        if module.bias is not None:
-            torch.nn.init.zeros_(module.bias)
-        if init_div_is_residual is not False and getattr(module, '_is_residual', False):
-            with torch.no_grad():
-                module.weight.div_(div_is_residual)
-    elif isinstance(module, nn.Embedding):
-        if emb_init_std is not None:
-            std = emb_init_std
-            if std == 0:
-                warnings.warn(f'Embedding layer initialized to 0.')
-            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
-            if verbose > 1:
-                warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
-        elif emb_init_uniform_lim is not None:
-            lim = emb_init_uniform_lim
-            if isinstance(lim, Sequence):
-                if len(lim) > 2:
-                    raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
-                if lim[0] == lim[1]:
-                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
-            else:
-                if lim == 0:
-                    warnings.warn(f'Embedding layer initialized to 0.')
-                lim = [-lim, lim]
-            (a, b) = lim
-            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
-            if verbose > 1:
-                warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
-        else:
-            emb_init_fn_ = init_fn_
-        emb_init_fn_(module.weight)
-    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
-        if verbose > 1:
-            warnings.warn(f'Norm weights are set to 1. 
If norm layer has a bias it is initialized to 0.') - if hasattr(module, 'weight') and module.weight is not None: - torch.nn.init.ones_(module.weight) - if hasattr(module, 'bias') and module.bias is not None: - torch.nn.init.zeros_(module.bias) - elif isinstance(module, nn.MultiheadAttention): - if module._qkv_same_embed_dim: - assert module.in_proj_weight is not None - assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None) - assert d_model is not None - _d = d_model - splits = (0, _d, 2 * _d, 3 * _d) - for (s, e) in zip(splits[:-1], splits[1:]): - init_fn_(module.in_proj_weight[s:e]) - else: - assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None) - assert module.in_proj_weight is None - init_fn_(module.q_proj_weight) - init_fn_(module.k_proj_weight) - init_fn_(module.v_proj_weight) - if module.in_proj_bias is not None: - torch.nn.init.zeros_(module.in_proj_bias) - if module.bias_k is not None: - torch.nn.init.zeros_(module.bias_k) - if module.bias_v is not None: - torch.nn.init.zeros_(module.bias_v) - init_fn_(module.out_proj.weight) - if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False): - with torch.no_grad(): - module.out_proj.weight.div_(div_is_residual) - if module.out_proj.bias is not None: - torch.nn.init.zeros_(module.out_proj.bias) - else: - for _ in module.parameters(recurse=False): - raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.') - -def _normal_init_(std, mean=0.0): - return partial(torch.nn.init.normal_, mean=mean, std=std) - -def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs): - del kwargs - init_fn_ = _normal_init_(std=std) - if verbose > 1: - warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}') - generic_param_init_fn_(module=module, init_fn_=init_fn_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) - -def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs): - del kwargs - if init_std is None: - raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.") - _normal_param_init_fn_(module=module, std=init_std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) - -def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs): - del kwargs - std = math.sqrt(2 / (5 * d_model)) - _normal_param_init_fn_(module=module, std=std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) - -def 
neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
-    """From section 2.3.1 of GPT-NeoX-20B:
-
-    An Open-Source Autoregressive Language Model — Black et al. (2022)
-    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
-    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
-    """
-    del kwargs
-    residual_div = n_layers / math.sqrt(10)
-    if verbose > 1:
-        warnings.warn(f'setting init_div_is_residual to {residual_div}')
-    small_param_init_fn_(module=module, d_model=d_model, n_layers=n_layers, init_div_is_residual=residual_div, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
-
-def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
-    del kwargs
-    if verbose > 1:
-        warnings.warn(f'Using nn.init.kaiming_uniform_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
-    kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
-    generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
-
-def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
-    del kwargs
-    if verbose > 1:
-        warnings.warn(f'Using nn.init.kaiming_normal_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
-    kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
-    generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
-
-def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
-    del kwargs
-    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
-    if verbose > 1:
-        warnings.warn(f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' + f'gain={init_gain}')
-    generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
-
-def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: 
Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs): - xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain) - if verbose > 1: - warnings.warn(f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' + f'gain={init_gain}') - generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose) -MODEL_INIT_REGISTRY = {'default_': torch_default_param_init_fn_, 'baseline_': baseline_param_init_fn_, 'kaiming_uniform_': kaiming_uniform_param_init_fn_, 'kaiming_normal_': kaiming_normal_param_init_fn_, 'neox_init_': neox_param_init_fn_, 'small_init_': small_param_init_fn_, 'xavier_uniform_': xavier_uniform_param_init_fn_, 'xavier_normal_': xavier_normal_param_init_fn_} \ No newline at end of file diff --git a/mllm/src/utils.py b/mllm/src/utils.py deleted file mode 100644 index 7895264638c7f52660e01436de00cc2bc0e52a89..0000000000000000000000000000000000000000 --- a/mllm/src/utils.py +++ /dev/null @@ -1,48 +0,0 @@ -def extend_instance(obj, mixin): - """Apply mixins to a class instance after creation""" - base_cls = obj.__class__ - base_cls_name = obj.__class__.__name__ - obj.__class__ = type( - base_cls_name, (mixin, base_cls), {} - ) # mixin needs to go first for our forward() logic to work - - -def getattr_recursive(obj, att): - """ - Return nested attribute of obj - Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c - """ - if att == "": - return obj - i = att.find(".") - if i < 0: - return getattr(obj, att) - else: - return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :]) - - -def setattr_recursive(obj, att, val): - """ - Set nested attribute of obj - Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val - """ - if "." in att: - obj = getattr_recursive(obj, ".".join(att.split(".")[:-1])) - setattr(obj, att.split(".")[-1], val) - - -def apply_with_stopping_condition( - module, apply_fn, apply_condition=None, stopping_condition=None, **other_args -): - if stopping_condition(module): - return - if apply_condition(module): - apply_fn(module, **other_args) - for child in module.children(): - apply_with_stopping_condition( - child, - apply_fn, - apply_condition=apply_condition, - stopping_condition=stopping_condition, - **other_args - )
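# Editorial sketch (not part of the original file): how the helpers in utils.py compose.
# The `Toy` module and the mixin are made up for illustration; only `torch` is assumed.
import torch
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))

    def forward(self, x):
        return self.backbone(x)

class LoggingMixin:
    def forward(self, *args, **kwargs):
        print('forward called')          # runs before the wrapped class's forward
        return super().forward(*args, **kwargs)

m = Toy()
second = getattr_recursive(m, 'backbone.1')          # same module as m.backbone[1]
setattr_recursive(m, 'backbone.1', nn.Linear(4, 3))  # swap that layer out in place
extend_instance(m, LoggingMixin)                     # mixin goes first in the MRO,
out = m(torch.randn(1, 4))                           # so this prints, then delegates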