gray311 committed on
Commit 8443bea · verified · 1 Parent(s): 86caf27

Delete mllm

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. mllm/flamingo/__init__.py +0 -48
  2. mllm/flamingo/config.json +0 -21
  3. mllm/flamingo/configuration_flamingo.py +0 -100
  4. mllm/flamingo/converting_flamingo_to_bf16.py +0 -30
  5. mllm/flamingo/converting_flamingo_to_hf.py +0 -61
  6. mllm/flamingo/converting_flamingo_to_lora.py +0 -68
  7. mllm/flamingo/falcon/__init__.py +0 -0
  8. mllm/flamingo/falcon/__pycache__/__init__.cpython-39.pyc +0 -0
  9. mllm/flamingo/falcon/__pycache__/configuration_RW.cpython-39.pyc +0 -0
  10. mllm/flamingo/falcon/__pycache__/modelling_RW.cpython-39.pyc +0 -0
  11. mllm/flamingo/falcon/configuration_RW.py +0 -79
  12. mllm/flamingo/falcon/modelling_RW.py +0 -1064
  13. mllm/flamingo/flamingo-falcon-7B.json +0 -112
  14. mllm/flamingo/flamingo-llama2-chat-13B.json +0 -114
  15. mllm/flamingo/flamingo-llama2-chat-7B.json +0 -115
  16. mllm/flamingo/flamingo-mpt-1B-redpajama.json +0 -131
  17. mllm/flamingo/flamingo-mpt-30B-bf16.json +0 -195
  18. mllm/flamingo/flamingo-mpt-30B.json +0 -195
  19. mllm/flamingo/flamingo-mpt-7B.json +0 -195
  20. mllm/flamingo/flamingo-vicuna-33B-v1.3.json +0 -111
  21. mllm/flamingo/flamingo-vicuna-7B-v1.3.json +0 -111
  22. mllm/flamingo/injecting_falcon_into_flamingo.py +0 -49
  23. mllm/flamingo/injecting_llama2_into_flamingo.py +0 -95
  24. mllm/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py +0 -97
  25. mllm/flamingo/injecting_mpt_into_flamingo.py +0 -109
  26. mllm/flamingo/injecting_vicuna_into_flamingo.py +0 -100
  27. mllm/flamingo/modeling_flamingo.py +0 -966
  28. mllm/flamingo/mpt/__init__.py +0 -0
  29. mllm/flamingo/mpt/__pycache__/__init__.cpython-39.pyc +0 -0
  30. mllm/flamingo/mpt/__pycache__/attention.cpython-39.pyc +0 -0
  31. mllm/flamingo/mpt/__pycache__/blocks.cpython-39.pyc +0 -0
  32. mllm/flamingo/mpt/__pycache__/configuration_mpt.cpython-39.pyc +0 -0
  33. mllm/flamingo/mpt/__pycache__/custom_embedding.cpython-39.pyc +0 -0
  34. mllm/flamingo/mpt/__pycache__/flash_attn_triton.cpython-39.pyc +0 -0
  35. mllm/flamingo/mpt/__pycache__/modeling_mpt.cpython-39.pyc +0 -0
  36. mllm/flamingo/mpt/__pycache__/norm.cpython-39.pyc +0 -0
  37. mllm/flamingo/mpt/__pycache__/param_init_fns.cpython-39.pyc +0 -0
  38. mllm/flamingo/mpt/adapt_tokenizer.py +0 -44
  39. mllm/flamingo/mpt/attention.py +0 -450
  40. mllm/flamingo/mpt/blocks.py +0 -82
  41. mllm/flamingo/mpt/configuration_mpt.py +0 -161
  42. mllm/flamingo/mpt/custom_embedding.py +0 -11
  43. mllm/flamingo/mpt/flash_attn_triton.py +0 -841
  44. mllm/flamingo/mpt/hf_prefixlm_converter.py +0 -575
  45. mllm/flamingo/mpt/meta_init_context.py +0 -98
  46. mllm/flamingo/mpt/modeling_mpt.py +0 -496
  47. mllm/flamingo/mpt/norm.py +0 -60
  48. mllm/flamingo/mpt/param_init_fns.py +0 -369
  49. mllm/flamingo/mpt_redpajama/__init__.py +0 -0
  50. mllm/flamingo/mpt_redpajama/__pycache__/__init__.cpython-39.pyc +0 -0
mllm/flamingo/__init__.py DELETED
@@ -1,48 +0,0 @@
-from typing import TYPE_CHECKING
-
-from transformers.utils import (
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    is_torch_available,
-)
-
-
-_import_structure = {
-    "configuration_flamingo": [
-        "FlamingoConfig",
-    ],
-}
-
-try:
-    if not is_torch_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["modeling_flamingo"] = [
-        "FlamingoModel",
-        "FlamingoPreTrainedModel",
-        "FlamingoForConditionalGeneration",
-    ]
-
-if TYPE_CHECKING:
-    from .configuration_flamingo import FlamingoConfig
-
-    # from .processing_flamingo import FlamingoProcessor
-
-    try:
-        if not is_torch_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .modeling_flamingo import (
-            FlamingoForConditionalGeneration,
-            FlamingoModel,
-            FlamingoPreTrainedModel,
-        )
-
-else:
-    import sys
-
-    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
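For context, this `__init__.py` used the standard `transformers` lazy-module pattern: submodules are imported only when one of their names is first accessed, and torch-dependent symbols are registered only if `is_torch_available()`. A minimal hedged sketch of the mechanism (a simplified stand-in, not the actual `_LazyModule`):

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Simplified stand-in for transformers' _LazyModule: defer submodule imports."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported attribute to the submodule that defines it.
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}

    def __getattr__(self, attr):
        if attr not in self._attr_to_module:
            raise AttributeError(attr)
        module = importlib.import_module("." + self._attr_to_module[attr], self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache, so each submodule is imported at most once
        return value
```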
mllm/flamingo/config.json DELETED
@@ -1,21 +0,0 @@
-{
-  "model_type": "flamingo",
-  "cross_attn_every_n_layers": 4,
-  "tie_word_embeddings": false,
-  "use_media_placement_augmentation": true,
-  "only_attend_previous": true,
-  "text_config": {
-    "_name_or_path": "luodian/llama-7b-hf",
-    "model_type": "llama"
-  },
-  "vision_config": {
-    "_name_or_path": "openai/clip-vit-large-patch14",
-    "model_type": "clip_vision_model",
-    "hidden_size": 1024,
-    "intermediate_size": 4096,
-    "num_attention_heads": 16,
-    "num_hidden_layers": 24,
-    "image_size": 224,
-    "patch_size": 14
-  }
-}
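The key field here is `cross_attn_every_n_layers: 4`: roughly one gated cross-attention block per 4 decoder layers, so a 32-layer LLaMA-7B decoder would get about 8 such blocks. A hedged sketch of reading the file (local path assumed):

```python
import json

# Hypothetical local copy of the file shown above.
with open("mllm/flamingo/config.json") as f:
    cfg = json.load(f)

num_lm_layers = 32  # LLaMA-7B, matching "luodian/llama-7b-hf" above
print(num_lm_layers // cfg["cross_attn_every_n_layers"])  # -> 8 cross-attention blocks
```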
mllm/flamingo/configuration_flamingo.py DELETED
@@ -1,100 +0,0 @@
-import copy
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-from transformers.models.auto import CONFIG_MAPPING
-from transformers.models.clip import CLIPVisionConfig
-import sys
-
-from .falcon.configuration_RW import RWConfig
-from .mpt.configuration_mpt import MPTConfig
-from .mpt_redpajama.configuration_mosaic_gpt import MosaicGPTConfig
-
-
-logger = logging.get_logger(__name__)
-
-
-class FlamingoConfig(PretrainedConfig):
-    r"""
-    [`FlamingoConfig`] is the configuration class to store the configuration of a [`FlamingoForConditionalGeneration`]. It is
-    used to instantiate a Flamingo model according to the specified arguments, defining the vision model and language model
-    configs. Instantiating a configuration with the defaults will yield a configuration similar to that of the Flamingo
-    architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        vision_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize [`PretrainedConfig`].
-        text_config (`dict`, *optional*):
-            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
-        cross_attn_every_n_layers (`int`, *optional*, defaults to 4):
-            The number of transformer layers between each added cross-attention layer.
-        kwargs (*optional*):
-            Dictionary of keyword arguments.
-
-    Example:
-
-    ```python
-    >>> from transformers import (
-    ...     PretrainedConfig,
-    ...     OPTConfig,
-    ...     FlamingoConfig,
-    ...     FlamingoForConditionalGeneration,
-    ... )
-
-    >>> # Initializing a FlamingoConfig with Salesforce/Flamingo-opt-2.7b style configuration
-    >>> configuration = FlamingoConfig()
-
-    >>> # Initializing a FlamingoForConditionalGeneration (with random weights) from the Salesforce/Flamingo-opt-2.7b style configuration
-    >>> model = FlamingoForConditionalGeneration(configuration)
-    ```"""
-
-    model_type = "flamingo"
-    is_composition = True
-
-    def __init__(self, vision_config=None, text_config=None, cross_attn_every_n_layers: int = 4, use_media_placement_augmentation: bool = True, **kwargs):
-        super().__init__(**kwargs)
-        if vision_config is None:
-            vision_config = {}
-            logger.info("vision_config is None. Initializing the vision config with default values.")
-
-        if text_config is None:
-            text_config = {}
-            logger.info("text_config is None. Initializing the text config with default values.")
-
-        self.vision_config = CLIPVisionConfig(**vision_config)
-        if "architectures" in text_config.keys() and text_config["architectures"] is not None:
-            if text_config["architectures"][0] == "MPTForCausalLM":
-                self.text_config = MPTConfig(**text_config)
-            elif text_config["architectures"][0] == "MosaicGPT":
-                self.text_config = MosaicGPTConfig(**text_config)
-            elif text_config["architectures"][0] == "RWForCausalLM":
-                self.text_config = RWConfig(**text_config)
-            elif text_config["architectures"][0] == "LlamaForCausalLM":
-                self.text_config = CONFIG_MAPPING[text_config.pop("model_type")](**text_config)
-            else:
-                import pdb
-
-                pdb.set_trace()
-        else:
-            self.text_config = CONFIG_MAPPING[text_config.pop("model_type")](**text_config)
-
-        self.cross_attn_every_n_layers = cross_attn_every_n_layers
-        self.use_media_placement_augmentation = use_media_placement_augmentation
-
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["vision_config"] = self.vision_config.to_dict()
-        output["text_config"] = self.text_config.to_dict()
-        output["model_type"] = self.__class__.model_type
-        output["cross_attn_every_n_layers"] = self.cross_attn_every_n_layers
-        output["use_media_placement_augmentation"] = self.use_media_placement_augmentation
-        return output
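A hedged usage sketch of the class above, composing the same sub-configs as `config.json` (this assumes the deleted package were still importable under `mllm.flamingo`):

```python
from mllm.flamingo.configuration_flamingo import FlamingoConfig

# Without an "architectures" key, text_config is dispatched through CONFIG_MAPPING["llama"].
config = FlamingoConfig(
    vision_config={
        "hidden_size": 1024, "intermediate_size": 4096, "num_attention_heads": 16,
        "num_hidden_layers": 24, "image_size": 224, "patch_size": 14,
    },
    text_config={"model_type": "llama"},
    cross_attn_every_n_layers=4,
)
assert config.to_dict()["model_type"] == "flamingo"
```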
mllm/flamingo/converting_flamingo_to_bf16.py DELETED
@@ -1,30 +0,0 @@
-import argparse
-import os
-
-import torch
-
-from .configuration_flamingo import FlamingoConfig
-from .modeling_flamingo import FlamingoForConditionalGeneration
-
-parser = argparse.ArgumentParser(description="Load model with precision")
-parser.add_argument("--load_bit", type=str, choices=["fp16", "bf16"], required=True, help="Choose either 'fp16' or 'bf16'")
-parser.add_argument("--pretrained_model_path", type=str, default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", required=True)
-parser.add_argument("--saved_model_path", type=str, default="/home/luodian/projects/checkpoints/flamingo-mpt-7B-instruct-init", required=True)
-args = parser.parse_args()
-
-load_bit = args.load_bit
-pretrained_model_path = args.pretrained_model_path
-
-if load_bit == "fp16":
-    precision = {"torch_dtype": torch.float16}
-elif load_bit == "bf16":
-    precision = {"torch_dtype": torch.bfloat16}
-
-root_dir = os.environ["AZP"]
-print(root_dir)
-device_id = "cpu"
-model = FlamingoForConditionalGeneration.from_pretrained(pretrained_model_path, device_map={"": device_id}, **precision)
-
-# save model to same folder
-checkpoint_path = pretrained_model_path + f"-{load_bit}"
-model.save_pretrained(checkpoint_path, max_shard_size="10GB")
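The script boils down to reloading with a `torch_dtype` override and re-sharding; a hedged standalone equivalent (placeholder paths, with `AutoModelForCausalLM` standing in for `FlamingoForConditionalGeneration`):

```python
import torch
from transformers import AutoModelForCausalLM  # stand-in for FlamingoForConditionalGeneration

src = "/path/to/flamingo-mpt-7B"  # placeholder checkpoint directory
model = AutoModelForCausalLM.from_pretrained(src, torch_dtype=torch.bfloat16, device_map={"": "cpu"})
model.save_pretrained(src + "-bf16", max_shard_size="10GB")  # same "<src>-<load_bit>" naming rule
```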
mllm/flamingo/converting_flamingo_to_hf.py DELETED
@@ -1,61 +0,0 @@
-"""Convert an Otter pt checkpoint to an Otter HF folder. Will be removed once the HF model is used for training.
-"""
-
-import re
-import argparse
-import os
-
-import torch
-import torch.nn as nn
-from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer
-
-import sys
-from modeling_flamingo import FlamingoForConditionalGeneration
-
-from configuration_flamingo import FlamingoConfig
-
-
-@torch.no_grad()
-def dump_hf_model(pretrained_model_path: str, old_ckpt_path: str, new_folder_path: str) -> None:
-    old_ckpt = torch.load(old_ckpt_path, map_location="cpu")
-    if old_ckpt.get("model_state_dict", None) is not None:
-        old_ckpt = old_ckpt["model_state_dict"]
-    new_ckpt = old_ckpt
-    folder_path = os.path.dirname(old_ckpt_path)
-    # config_path = os.path.join(folder_path, "config.json") if os.path.exists(os.path.join(folder_path, "config.json")) else "flamingo/config.json"
-    model = FlamingoForConditionalGeneration.from_pretrained(
-        pretrained_model_path,
-        device_map="auto",
-    )
-    _ = model.load_state_dict(new_ckpt, strict=False)
-    print(f"Saving HF model to {new_folder_path}")
-    model.save_pretrained(new_folder_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--old_ckpt_path",
-        "-old",
-        type=str,
-        required=True,
-        help="Path to the pt checkpoint",
-    )
-    parser.add_argument(
-        "--new_hf_path",
-        "-new",
-        type=str,
-        required=True,
-        help="Path to the hf folder",
-    )
-    parser.add_argument(
-        "--pretrained_model_path",
-        "-pretrained",
-        type=str,
-        required=True,
-        help="Path to the pretrained model folder",
-    )
-    args = parser.parse_args()
-    if not os.path.exists(os.path.dirname(args.new_hf_path)):
-        os.makedirs(os.path.dirname(args.new_hf_path))
-    dump_hf_model(args.pretrained_model_path, args.old_ckpt_path, args.new_hf_path)
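Because the load above uses `strict=False`, key mismatches are swallowed silently; when adapting such a conversion script, it may be worth surfacing them. A hedged sketch (checkpoint path is a placeholder, and `model` is assumed to be the freshly instantiated model):

```python
import torch

ckpt = torch.load("/path/to/old_checkpoint.pt", map_location="cpu")  # placeholder path
state_dict = ckpt.get("model_state_dict", ckpt)  # same unwrapping rule as dump_hf_model

# `model` is assumed to be the FlamingoForConditionalGeneration loaded above.
result = model.load_state_dict(state_dict, strict=False)
print("missing keys:", result.missing_keys)
print("unexpected keys:", result.unexpected_keys)
```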
mllm/flamingo/converting_flamingo_to_lora.py DELETED
@@ -1,68 +0,0 @@
-import argparse
-import torch
-import sys
-
-from .modeling_flamingo import FlamingoForConditionalGeneration
-from peft import get_peft_model, LoraConfig, TaskType
-
-MODEL_CLASSES = {
-    "LlamaForCausalLM": "llama",
-    "OPTForCausalLM": "opt",
-    "GPTJForCausalLM": "gptj",
-    "GPTNeoXForCausalLM": "gpt_neox",
-    "MPTForCausalLM": "mpt",
-}
-
-# Define argument parser
-parser = argparse.ArgumentParser(description="Load a model with specified precision and save it to a specified path.")
-
-# Add arguments
-parser.add_argument(
-    "--checkpoint_path",
-    type=str,
-    help="Path to the pre-trained model checkpoint.",
-    default="",
-)
-parser.add_argument(
-    "--save_path",
-    type=str,
-    default="",
-    help="Path to the converted model checkpoint.",
-)
-
-# Parse the input arguments
-args = parser.parse_args()
-
-load_bit = "bf16"
-if load_bit == "fp16":
-    precision = {"torch_dtype": torch.float16}
-elif load_bit == "bf16":
-    precision = {"torch_dtype": torch.bfloat16}
-
-# Load the model
-model = FlamingoForConditionalGeneration.from_pretrained(args.checkpoint_path, device_map="auto", **precision)
-
-# adding lora
-standard_modules = ["q_proj", "v_proj"]
-lang_encoder_short_name = MODEL_CLASSES[model.config.text_config.architectures[0]]
-model_to_lora_modules = {
-    "llama": standard_modules,
-    "opt": standard_modules,
-    "gptj": standard_modules,
-    "gpt_neox": ["query_key_value"],
-    "mpt": ["Wqkv"],
-}
-lora_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    lora_dropout=0.05,
-    task_type=TaskType.CAUSAL_LM,
-    target_modules=model_to_lora_modules[lang_encoder_short_name],
-)
-model.config.update({"lora_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05}})
-model.lang_encoder = get_peft_model(model.lang_encoder, lora_config)
-model.lang_encoder.print_trainable_parameters()
-
-# Save the model
-checkpoint_path = args.save_path
-model.save_pretrained(checkpoint_path)
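The load-bearing detail above is the per-architecture `target_modules` table: LoRA adapters must be attached to the actual projection-module names, which differ across decoder families. A hedged sketch of the same idea on a plain HF model (module names are the standard ones per family, not taken from this diff):

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder base model
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_attn"],  # GPT-2's fused QKV; LLaMA uses ["q_proj", "v_proj"], MPT uses ["Wqkv"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```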
mllm/flamingo/falcon/__init__.py DELETED
File without changes
mllm/flamingo/falcon/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (209 Bytes)
 
mllm/flamingo/falcon/__pycache__/configuration_RW.cpython-39.pyc DELETED
Binary file (1.86 kB)
 
mllm/flamingo/falcon/__pycache__/modelling_RW.cpython-39.pyc DELETED
Binary file (28.5 kB)
 
mllm/flamingo/falcon/configuration_RW.py DELETED
@@ -1,79 +0,0 @@
-# coding=utf-8
-# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Falcon RW configuration (adapted from the Bloom configuration)."""
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class RWConfig(PretrainedConfig):
-    model_type = "RefinedWebModel"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {
-        "num_hidden_layers": "n_layer",
-        "num_attention_heads": "n_head",
-    }
-
-    def __init__(
-        self,
-        vocab_size=250880,
-        hidden_size=64,
-        n_layer=2,
-        n_head=8,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        use_cache=True,
-        bos_token_id=1,
-        eos_token_id=2,
-        apply_residual_connection_post_layernorm=False,
-        hidden_dropout=0.0,
-        attention_dropout=0.0,
-        multi_query=False,
-        alibi=False,
-        bias=False,
-        parallel_attn=False,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        # Backward compatibility with n_embed kwarg
-        n_embed = kwargs.pop("n_embed", None)
-        self.hidden_size = hidden_size if n_embed is None else n_embed
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
-        self.hidden_dropout = hidden_dropout
-        self.attention_dropout = attention_dropout
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.multi_query = multi_query
-        self.alibi = alibi
-        self.bias = bias
-        self.parallel_attn = parallel_attn
-
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-
-    @property
-    def head_dim(self):
-        return self.hidden_size // self.n_head
-
-    @property
-    def rotary(self):
-        return not self.alibi
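Two derived properties of `RWConfig` matter downstream in `modelling_RW.py`: `head_dim`, and `rotary`, which is simply the negation of `alibi` (the model uses rotary embeddings exactly when ALiBi is off). A hedged check against the defaults (import path as in this deleted tree, so this assumes the package were still present):

```python
from mllm.flamingo.falcon.configuration_RW import RWConfig

cfg = RWConfig()            # defaults: hidden_size=64, n_head=8, alibi=False
assert cfg.head_dim == 8    # 64 // 8
assert cfg.rotary is True   # rotary <=> not alibi
```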
mllm/flamingo/falcon/modelling_RW.py DELETED
@@ -1,1064 +0,0 @@
-# port of models described in RW
-# We use the bloom model as a starting point for these models.
-# Please refer to the bloom models for usage instructions.
-
-import math
-import warnings
-from typing import Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
-from torch.nn import functional as F
-
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPastAndCrossAttentions,
-    CausalLMOutputWithCrossAttentions,
-    QuestionAnsweringModelOutput,
-    SequenceClassifierOutputWithPast,
-    TokenClassifierOutput,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import logging
-from .configuration_RW import RWConfig
-
-logger = logging.get_logger(__name__)
-
-
-# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, which means that there is one additional quantization to bfloat16 between the operations.
-# In order not to degrade the quality of our HF-port, we keep these characteristics in the final model.
-class Linear(nn.Linear):
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        ret = input @ self.weight.T
-        if self.bias is None:
-            return ret
-        else:
-            return ret + self.bias
-
-
-from einops import rearrange
-
-
-# rotary pos emb helpers (torch.jit.script does not seem to support staticmethod...)
-def rotate_half(x):
-    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=x1.ndim - 1)  # dim=-1 triggers a bug in torch < 1.8.0
-
-
-class RotaryEmbedding(torch.nn.Module):
-    """Implementation of RotaryEmbedding from GPT-NeoX.
-    This implementation is designed to operate on queries and keys that are compatible with
-    [batch_size, n_heads_per_partition, seq_len, head_dim] (e.g. MinGPTAttention format).
-    """
-
-    def __init__(
-        self,
-        head_dim: int,
-        base=10000,
-    ):
-        super().__init__()
-        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.head_dim = head_dim
-        self.seq_len_cached = None
-        self.batch_size_cached = None
-        self.cos_cached: torch.Tensor | None = None
-        self.sin_cached: torch.Tensor | None = None
-
-    def cos_sin(
-        self,
-        seq_len: int,
-        device="cuda",
-        dtype=torch.bfloat16,
-    ) -> torch.Tensor:
-        if seq_len != self.seq_len_cached:
-            self.seq_len_cached = seq_len
-            t = torch.arange(seq_len, device=device).type_as(self.inv_freq)
-            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-            emb = torch.cat((freqs, freqs), dim=-1).to(device)
-
-            if dtype in [torch.float16, torch.bfloat16]:
-                emb = emb.float()
-
-            self.cos_cached = emb.cos()[None, :, :]
-            self.sin_cached = emb.sin()[None, :, :]
-
-            self.cos_cached = self.cos_cached.type(dtype)
-            self.sin_cached = self.sin_cached.type(dtype)
-
-        return self.cos_cached, self.sin_cached
-
-    def forward(self, q, k):
-        batch, seq_len, head_dim = q.shape
-        cos, sin = self.cos_sin(seq_len, q.device, q.dtype)
-        return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
-
-
-def _make_causal_mask(input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int) -> torch.BoolTensor:
-    batch_size, target_length = input_ids_shape
-    mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device)
-    # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround
-    seq_ids = torch.arange(target_length, device=device)
-    mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :]
-
-    if past_key_values_length > 0:
-        mask[:, :past_key_values_length] = False
-
-    expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
-    return expanded_mask
-
-
-def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
-    batch_size, src_length = mask.shape
-    tgt_length = tgt_length if tgt_length is not None else src_length
-
-    expanded_mask = ~(mask[:, None, None, :].to(torch.bool))
-    return expanded_mask.expand(batch_size, 1, tgt_length, src_length)
-
-
-def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
-    batch_size, seq_length = attention_mask.shape
-    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
-    base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32)
-    powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
-    slopes = torch.pow(base, powers)
-
-    if closest_power_of_2 != num_heads:
-        extra_base = torch.tensor(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32)
-        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
-        extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32)
-        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
-
-    # Note: alibi will be added to the attention bias that will be applied to the query, key product of attention
-    # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
-    # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
-    # => the query_length dimension will then be broadcasted correctly
-    # This is more or less identical to T5's relative position bias:
-    # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
-    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
-    alibi = slopes[..., None].bfloat16() * arange_tensor
-    return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
-
-
-def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
-    out = F.dropout(x, p=prob, training=training)
-    out = residual + out
-    return out
-
-
-class Attention(nn.Module):
-    def __init__(self, config: RWConfig):
-        super().__init__()
-
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.n_head
-        self.head_dim = self.hidden_size // self.num_heads
-        self.split_size = self.hidden_size
-        self.hidden_dropout = config.hidden_dropout
-
-        if self.head_dim * self.num_heads != self.hidden_size:
-            raise ValueError(f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" f" {self.num_heads}).")
-
-        self.maybe_rotary = RotaryEmbedding(config.head_dim) if config.rotary else lambda q, k: (q, k)
-
-        # Layer-wise attention scaling
-        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
-        self.beta = self.inv_norm_factor
-
-        self.query_key_value = Linear(
-            self.hidden_size,
-            3 * self.hidden_size if not config.multi_query else (self.hidden_size + 2 * self.head_dim),
-            bias=config.bias,
-        )
-        self.multi_query = config.multi_query
-        self.dense = Linear(self.hidden_size, self.hidden_size, bias=config.bias)
-        self.attention_dropout = nn.Dropout(config.attention_dropout)
-        self.num_kv = config.n_head if not self.multi_query else 1
-
-    def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Split the last dimension into (num_heads, head_dim) without making any copies; the results share the same
-        memory storage as `fused_qkv`.
-
-        Args:
-            fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
-
-        Returns:
-            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
-            value: [batch_size, seq_length, num_heads, head_dim]
-        """
-        if not self.multi_query:
-            batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
-            fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
-            return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]
-        else:
-            batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
-            fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim)
-            return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :]
-
-    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Merge heads together over the last dimension.
-
-        Args:
-            x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
-
-        Returns:
-            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
-        """
-        # What we want to achieve is:
-        # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
-        batch_size_and_num_heads, seq_length, _ = x.shape
-        batch_size = batch_size_and_num_heads // self.num_heads
-
-        # First view to decompose the batch size
-        # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
-        x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)
-
-        # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
-        x = x.permute(0, 2, 1, 3)
-
-        # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
-        return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        alibi: torch.Tensor,
-        attention_mask: torch.Tensor,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        use_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
-
-        # 3 x [batch_size, seq_length, num_heads, head_dim]
-        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
-
-        batch_size, q_length, _, _ = query_layer.shape
-
-        query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
-        key_layer = key_layer.transpose(1, 2).reshape(
-            batch_size * self.num_kv,
-            q_length,
-            self.head_dim,
-        )
-        value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_kv, q_length, self.head_dim)
-
-        query_layer, key_layer = self.maybe_rotary(query_layer, key_layer)
-
-        if layer_past is not None:
-            past_key, past_value = layer_past
-            # concatenate along seq_length dimension:
-            #  - key: [batch_size * self.num_heads, head_dim, kv_length]
-            #  - value: [batch_size * self.num_heads, kv_length, head_dim]
-            key_layer = torch.cat((past_key, key_layer), dim=1)
-            value_layer = torch.cat((past_value, value_layer), dim=1)
-
-        _, kv_length, _ = key_layer.shape
-
-        if use_cache is True:
-            present = (key_layer, value_layer)
-        else:
-            present = None
-
-        if alibi is None:
-            query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
-            key_layer_ = key_layer.reshape(batch_size, self.num_kv, -1, self.head_dim)
-            value_layer_ = value_layer.reshape(batch_size, self.num_kv, -1, self.head_dim)
-
-            attn_output = F.scaled_dot_product_attention(query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True)
-
-            x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
-            x = x.permute(0, 2, 1, 3)
-            attn_output = x.reshape(batch_size, q_length, self.num_heads * self.head_dim)
-
-            output_tensor = self.dense(attn_output)
-
-            outputs = (output_tensor, present)
-            assert not output_attentions  # not supported.
-            return outputs
-        else:
-            attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, -1e9).to(torch.bfloat16)
-            matmul_result = query_layer @ key_layer.transpose(-1, -2)
-
-            # change view to [batch_size, num_heads, q_length, kv_length]
-            attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length)
-
-            # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
-            input_dtype = attention_scores.dtype
-            # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
-            if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
-                attention_scores = attention_scores.to(torch.float32)
-            # attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min)
-            attention_probs = F.softmax(
-                (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)) * self.inv_norm_factor + attention_mask_float,
-                dim=-1,
-                dtype=hidden_states.dtype,
-            )
-            # [batch_size, num_heads, q_length, kv_length]
-            attention_probs = self.attention_dropout(attention_probs)
-
-            if head_mask is not None:
-                attention_probs = attention_probs * head_mask
-
-            # change view [batch_size x num_heads, q_length, kv_length]
-            attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, kv_length)
-
-            # matmul: [batch_size * num_heads, q_length, head_dim]
-            context_layer = attention_probs_reshaped @ value_layer
-
-            # change view [batch_size, num_heads, q_length, head_dim]
-            context_layer = self._merge_heads(context_layer)
-
-            output_tensor = self.dense(context_layer)
-
-            outputs = (output_tensor, present)
-            if output_attentions:
-                outputs += (attention_probs,)
-
-            return outputs
-
-
-class MLP(nn.Module):
-    def __init__(self, config: RWConfig):
-        super().__init__()
-        hidden_size = config.hidden_size
-
-        self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size, bias=config.bias)
-        self.act = nn.GELU()
-        self.dense_4h_to_h = Linear(4 * hidden_size, hidden_size, bias=config.bias)
-        self.hidden_dropout = config.hidden_dropout
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.act(self.dense_h_to_4h(x))
-        x = self.dense_4h_to_h(x)
-        return x
-
-
-class DecoderLayer(nn.Module):
-    def __init__(self, config: RWConfig):
-        super().__init__()
-        hidden_size = config.hidden_size
-
-        self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.num_heads = config.n_head
-        self.self_attention = Attention(config)
-
-        if not config.parallel_attn:
-            # unused if parallel attn
-            self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
-        self.mlp = MLP(config)
-
-        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
-        self.hidden_dropout = config.hidden_dropout
-
-        self.config = config
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        alibi: torch.Tensor,
-        attention_mask: torch.Tensor,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        use_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        layernorm_output = self.input_layernorm(hidden_states)
-        residual = hidden_states
-
-        # Self attention.
-        attn_outputs = self.self_attention(
-            layernorm_output,
-            layer_past=layer_past,
-            attention_mask=attention_mask,
-            alibi=alibi,
-            head_mask=head_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-        )
-
-        attention_output = attn_outputs[0]
-
-        if not self.config.parallel_attn:
-            residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training)
-            layernorm_output = self.post_attention_layernorm(residual)
-
-        outputs = attn_outputs[1:]
-
-        # MLP.
-        mlp_output = self.mlp(layernorm_output)
-
-        if self.config.parallel_attn:
-            mlp_output += attention_output
-
-        output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training)
-
-        if use_cache:
-            outputs = (output,) + outputs
-        else:
-            outputs = (output,) + outputs[1:]
-
-        return outputs  # hidden_states, present, attentions
-
-
-class RWPreTrainedModel(PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = RWConfig
-    base_model_prefix = "transformer"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["DecoderLayer"]
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module: nn.Module):
-        """Initialize the weights."""
-        if isinstance(module, nn.Linear) or isinstance(module, Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-    def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False):
-        if isinstance(module, RWModel):
-            module.gradient_checkpointing = value
-
-    @staticmethod
-    def _convert_to_standard_cache(past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size,
-        num_heads, ...]))
-        """
-        batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape
-        num_heads = batch_size_times_num_heads // batch_size
-        # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length]
-        # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim]
-        return tuple(
-            (
-                layer_past[0].view(batch_size, num_heads, head_dim, seq_length),
-                layer_past[1].view(batch_size, num_heads, seq_length, head_dim),
-            )
-            for layer_past in past_key_value
-        )
-
-    @staticmethod
-    def _convert_to_rw_cache(past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
-        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
-        batch_size_times_num_heads = batch_size * num_heads
-        # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
-        # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
-        return tuple(
-            (
-                layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length),
-                layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim),
-            )
-            for layer_past in past_key_value
-        )
-
-
477
- class RWModel(RWPreTrainedModel):
478
- def __init__(self, config: RWConfig):
479
- super().__init__(config)
480
-
481
- self.embed_dim = config.hidden_size
482
- self.num_heads = config.n_head
483
- self.alibi = config.alibi
484
-
485
- # Embedding + LN Embedding
486
- self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
487
-
488
- # Transformer blocks
489
- self.h = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
490
-
491
- # Final Layer Norm
492
- self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
493
-
494
- self.gradient_checkpointing = False
495
-
496
- # Initialize weights and apply final processing
497
- self.post_init()
498
-
499
- def get_input_embeddings(self):
500
- return self.word_embeddings
501
-
502
- def _prepare_attn_mask(self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor:
503
- # create causal mask
504
- # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
505
- combined_attention_mask = None
506
- device = attention_mask.device
507
- _, src_length = input_shape
508
-
509
- if src_length > 1:
510
- combined_attention_mask = _make_causal_mask(input_shape, device=device, past_key_values_length=past_key_values_length)
511
-
512
- # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
513
- expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
514
- combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
515
-
516
- return combined_attention_mask
517
-
518
- def set_input_embeddings(self, new_embeddings: torch.Tensor):
519
- self.word_embeddings = new_embeddings
520
-
521
- def forward(
522
- self,
523
- input_ids: Optional[torch.LongTensor] = None,
524
- past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
525
- attention_mask: Optional[torch.Tensor] = None,
526
- head_mask: Optional[torch.LongTensor] = None,
527
- inputs_embeds: Optional[torch.LongTensor] = None,
528
- use_cache: Optional[bool] = None,
529
- output_attentions: Optional[bool] = None,
530
- output_hidden_states: Optional[bool] = None,
531
- return_dict: Optional[bool] = None,
532
- **deprecated_arguments,
533
- ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
534
- if deprecated_arguments.pop("position_ids", False) is not False:
535
- # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
536
- warnings.warn(
537
- "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.",
538
- FutureWarning,
539
- )
540
- if len(deprecated_arguments) > 0:
541
- raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
542
-
543
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
544
- output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
545
- use_cache = use_cache if use_cache is not None else self.config.use_cache
546
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
547
-
548
- if input_ids is not None and inputs_embeds is not None:
549
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
550
- elif input_ids is not None:
551
- batch_size, seq_length = input_ids.shape
552
- elif inputs_embeds is not None:
553
- batch_size, seq_length, _ = inputs_embeds.shape
554
- else:
555
- raise ValueError("You have to specify either input_ids or inputs_embeds")
556
-
557
- if past_key_values is None:
558
- past_key_values = tuple([None] * len(self.h))
559
-
560
- # Prepare head mask if needed
561
- # 1.0 in head_mask indicate we keep the head
562
- # attention_probs has shape batch_size x num_heads x N x N
563
- # head_mask has shape n_layer x batch x num_heads x N x N
564
- head_mask = self.get_head_mask(head_mask, self.config.n_layer)
565
-
566
- if inputs_embeds is None:
567
- inputs_embeds = self.word_embeddings(input_ids)
568
-
569
- hidden_states = inputs_embeds
570
-
571
- presents = () if use_cache else None
572
- all_self_attentions = () if output_attentions else None
573
- all_hidden_states = () if output_hidden_states else None
574
-
575
- # Compute alibi tensor: check build_alibi_tensor documentation
576
- seq_length_with_past = seq_length
577
- past_key_values_length = 0
578
- if past_key_values[0] is not None:
579
- past_key_values_length = past_key_values[0][0].shape[2]
580
- seq_length_with_past = seq_length_with_past + past_key_values_length
581
- if attention_mask is None:
582
- attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
583
- else:
584
- attention_mask = attention_mask.to(hidden_states.device)
585
-
586
- if self.alibi:
587
- alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype)
588
- else:
589
- alibi = None
590
-
591
- causal_mask = self._prepare_attn_mask(
592
- attention_mask,
593
- input_shape=(batch_size, seq_length),
594
- past_key_values_length=past_key_values_length,
595
- )
596
-
597
- for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
598
- if output_hidden_states:
599
- all_hidden_states = all_hidden_states + (hidden_states,)
600
-
601
- if self.gradient_checkpointing and self.training:
602
- if use_cache:
603
- logger.warning("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
604
- use_cache = False
605
-
606
- def create_custom_forward(module):
607
- def custom_forward(*inputs):
608
- # None for past_key_value
609
- return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)
610
-
611
- return custom_forward
612
-
613
- outputs = torch.utils.checkpoint.checkpoint(
614
- create_custom_forward(block),
615
- hidden_states,
616
- alibi,
617
- causal_mask,
618
- head_mask[i],
619
- )
620
- else:
621
- outputs = block(
622
- hidden_states,
623
- layer_past=layer_past,
624
- attention_mask=causal_mask,
625
- head_mask=head_mask[i],
626
- use_cache=use_cache,
627
- output_attentions=output_attentions,
628
- alibi=alibi,
629
- )
630
-
631
- hidden_states = outputs[0]
632
- if use_cache is True:
633
- presents = presents + (outputs[1],)
634
-
635
- if output_attentions:
636
- all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
637
-
638
- # Add last hidden state
639
- hidden_states = self.ln_f(hidden_states)
640
-
641
- if output_hidden_states:
642
- all_hidden_states = all_hidden_states + (hidden_states,)
643
-
644
- if not return_dict:
645
- return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
646
-
647
- return BaseModelOutputWithPastAndCrossAttentions(
648
- last_hidden_state=hidden_states,
649
- past_key_values=presents,
650
- hidden_states=all_hidden_states,
651
- attentions=all_self_attentions,
652
- )
653
-
654
-
655
- class RWForCausalLM(RWPreTrainedModel):
656
- _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
657
-
658
- def __init__(self, config: RWConfig):
659
- super().__init__(config)
660
- self.transformer = RWModel(config)
661
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
662
-
663
- # Initialize weights and apply final processing
664
- self.post_init()
665
-
666
- def get_output_embeddings(self):
667
- return self.lm_head
668
-
669
- def set_output_embeddings(self, new_embeddings: torch.Tensor):
670
- self.lm_head = new_embeddings
671
-
672
- def prepare_inputs_for_generation(
673
- self,
674
- input_ids: torch.LongTensor,
675
- past: Optional[torch.Tensor] = None,
676
- attention_mask: Optional[torch.Tensor] = None,
677
- **kwargs,
678
- ) -> dict:
679
- # only last token for input_ids if past is not None
680
- if past:
681
- input_ids = input_ids[:, -1].unsqueeze(-1)
682
-
683
- # the cache may be in the stardard format (e.g. in contrastive search), convert to our's format if needed
684
- if past[0][0].shape[0] == input_ids.shape[0]:
685
- past = self._convert_to_rw_cache(past)
686
-
687
- return {
688
- "input_ids": input_ids,
689
- "past_key_values": past,
690
- "use_cache": kwargs.get("use_cache"),
691
- "attention_mask": attention_mask,
692
- }
693
-
694
- def forward(
695
- self,
696
- input_ids: Optional[torch.LongTensor] = None,
697
- past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
698
- attention_mask: Optional[torch.Tensor] = None,
699
- head_mask: Optional[torch.Tensor] = None,
700
- inputs_embeds: Optional[torch.Tensor] = None,
701
- labels: Optional[torch.Tensor] = None,
702
- use_cache: Optional[bool] = None,
703
- output_attentions: Optional[bool] = None,
704
- output_hidden_states: Optional[bool] = None,
705
- return_dict: Optional[bool] = None,
706
- **deprecated_arguments,
707
- ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
708
- r"""
709
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
710
- Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
711
- `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
712
- are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
713
- """
714
- if deprecated_arguments.pop("position_ids", False) is not False:
715
- # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
716
- warnings.warn(
717
- "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.",
718
- FutureWarning,
719
- )
720
- if len(deprecated_arguments) > 0:
721
- raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
722
-
723
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
724
-
725
- transformer_outputs = self.transformer(
726
- input_ids,
727
- past_key_values=past_key_values,
728
- attention_mask=attention_mask,
729
- head_mask=head_mask,
730
- inputs_embeds=inputs_embeds,
731
- use_cache=use_cache,
732
- output_attentions=output_attentions,
733
- output_hidden_states=output_hidden_states,
734
- return_dict=return_dict,
735
- )
736
- hidden_states = transformer_outputs[0]
737
-
738
- lm_logits = self.lm_head(hidden_states)
739
-
740
- loss = None
741
- if labels is not None:
742
- # Shift so that tokens < n predict n
743
- shift_logits = lm_logits[..., :-1, :].contiguous()
744
- shift_labels = labels[..., 1:].contiguous()
745
- batch_size, seq_length, vocab_size = shift_logits.shape
746
- # Flatten the tokens
747
- loss_fct = CrossEntropyLoss()
748
- loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
749
-
750
- if not return_dict:
751
- output = (lm_logits,) + transformer_outputs[1:]
752
- return ((loss,) + output) if loss is not None else output
753
-
754
- return CausalLMOutputWithCrossAttentions(
755
- loss=loss,
756
- logits=lm_logits,
757
- past_key_values=transformer_outputs.past_key_values,
758
- hidden_states=transformer_outputs.hidden_states,
759
- attentions=transformer_outputs.attentions,
760
- )
761
-
762
- def _reorder_cache(self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
763
- """
764
- This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
765
- [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
766
- beam_idx at every generation step.
767
-
768
- Output shares the same memory storage as `past`.
769
- """
770
- standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx))
771
-
772
- # Get a copy of `beam_idx` on all the devices where we need those indices.
773
- device_to_beam_idx = {past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past}
774
- reordered_past = tuple(
775
- (
776
- layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
777
- layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
778
- )
779
- for layer_past in standardized_past
780
- )
781
- return self._convert_to_rw_cache(reordered_past)
782
-
783
-
784
- class RWForSequenceClassification(RWPreTrainedModel):
785
- _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
786
-
787
- def __init__(self, config: RWConfig):
788
- super().__init__(config)
789
- self.num_labels = config.num_labels
790
- self.transformer = RWModel(config)
791
- self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
792
-
793
- # Initialize weights and apply final processing
794
- self.post_init()
795
-
796
- def forward(
797
- self,
798
- input_ids: Optional[torch.LongTensor] = None,
799
- past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
800
- attention_mask: Optional[torch.Tensor] = None,
801
- head_mask: Optional[torch.Tensor] = None,
802
- inputs_embeds: Optional[torch.Tensor] = None,
803
- labels: Optional[torch.Tensor] = None,
804
- use_cache: Optional[bool] = None,
805
- output_attentions: Optional[bool] = None,
806
- output_hidden_states: Optional[bool] = None,
807
- return_dict: Optional[bool] = None,
808
- **deprecated_arguments,
809
- ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
810
- r"""
811
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
812
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
813
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
814
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
815
- """
816
- if deprecated_arguments.pop("position_ids", False) is not False:
817
- # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
818
- warnings.warn(
819
- "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.",
820
- FutureWarning,
821
- )
822
- if len(deprecated_arguments) > 0:
823
- raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
824
-
825
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
826
-
827
- transformer_outputs = self.transformer(
828
- input_ids,
829
- past_key_values=past_key_values,
830
- attention_mask=attention_mask,
831
- head_mask=head_mask,
832
- inputs_embeds=inputs_embeds,
833
- use_cache=use_cache,
834
- output_attentions=output_attentions,
835
- output_hidden_states=output_hidden_states,
836
- return_dict=return_dict,
837
- )
838
-
839
- hidden_states = transformer_outputs[0]
840
- logits = self.score(hidden_states)
841
-
842
- if input_ids is not None:
843
- batch_size = input_ids.shape[0]
844
- else:
845
- batch_size = inputs_embeds.shape[0]
846
-
847
- if self.config.pad_token_id is None and batch_size != 1:
848
- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
849
- if self.config.pad_token_id is None:
850
- sequence_lengths = -1
851
- else:
852
- if input_ids is not None:
853
- sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(dim=-1) - 1
854
- else:
855
- sequence_lengths = -1
856
- logger.warning(
857
- f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
858
- "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
859
- )
860
-
861
- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
862
-
863
- loss = None
864
- if labels is not None:
865
- if self.config.problem_type is None:
866
- if self.num_labels == 1:
867
- self.config.problem_type = "regression"
868
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
869
- self.config.problem_type = "single_label_classification"
870
- else:
871
- self.config.problem_type = "multi_label_classification"
872
-
873
- if self.config.problem_type == "regression":
874
- loss_fct = MSELoss()
875
- if self.num_labels == 1:
876
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
877
- else:
878
- loss = loss_fct(pooled_logits, labels)
879
- elif self.config.problem_type == "single_label_classification":
880
- loss_fct = CrossEntropyLoss()
881
- loss = loss_fct(pooled_logits, labels)
882
- elif self.config.problem_type == "multi_label_classification":
883
- loss_fct = BCEWithLogitsLoss()
884
- loss = loss_fct(pooled_logits, labels)
885
- if not return_dict:
886
- output = (pooled_logits,) + transformer_outputs[1:]
887
- return ((loss,) + output) if loss is not None else output
888
-
889
- return SequenceClassifierOutputWithPast(
890
- loss=loss,
891
- logits=pooled_logits,
892
- past_key_values=transformer_outputs.past_key_values,
893
- hidden_states=transformer_outputs.hidden_states,
894
- attentions=transformer_outputs.attentions,
895
- )
896
-
897
-
898
- class RWForTokenClassification(RWPreTrainedModel):
899
- _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
900
-
901
- def __init__(self, config: RWConfig):
902
- super().__init__(config)
903
- self.num_labels = config.num_labels
904
-
905
- self.transformer = RWModel(config)
906
- if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
907
- classifier_dropout = config.classifier_dropout
908
- elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
909
- classifier_dropout = config.hidden_dropout
910
- else:
911
- classifier_dropout = 0.1
912
- self.dropout = nn.Dropout(classifier_dropout)
913
- self.classifier = nn.Linear(config.hidden_size, config.num_labels)
914
-
915
- # Initialize weights and apply final processing
916
- self.post_init()
917
-
918
- def forward(
919
- self,
920
- input_ids: Optional[torch.LongTensor] = None,
921
- past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
922
- attention_mask: Optional[torch.Tensor] = None,
923
- head_mask: Optional[torch.Tensor] = None,
924
- inputs_embeds: Optional[torch.Tensor] = None,
925
- labels: Optional[torch.Tensor] = None,
926
- use_cache: Optional[bool] = None,
927
- output_attentions: Optional[bool] = None,
928
- output_hidden_states: Optional[bool] = None,
929
- return_dict: Optional[bool] = None,
930
- **deprecated_arguments,
931
- ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
932
- r"""
933
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
934
- Labels for computing the token classification loss. Indices should be in `[0, ...,
935
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
936
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
937
- """
938
- if deprecated_arguments.pop("position_ids", False) is not False:
939
- # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
940
- warnings.warn(
941
- "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.",
942
- FutureWarning,
943
- )
944
- if len(deprecated_arguments) > 0:
945
- raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
946
-
947
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
948
-
949
- transformer_outputs = self.transformer(
950
- input_ids,
951
- past_key_values=past_key_values,
952
- attention_mask=attention_mask,
953
- head_mask=head_mask,
954
- inputs_embeds=inputs_embeds,
955
- use_cache=use_cache,
956
- output_attentions=output_attentions,
957
- output_hidden_states=output_hidden_states,
958
- return_dict=return_dict,
959
- )
960
-
961
- hidden_states = transformer_outputs[0]
962
- hidden_states = self.dropout(hidden_states)
963
- logits = self.classifier(hidden_states)
964
-
965
- loss = None
966
- if labels is not None:
967
- batch_size, seq_length = labels.shape
968
- loss_fct = CrossEntropyLoss()
969
- loss = loss_fct(logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length))
970
-
971
- if not return_dict:
972
- output = (logits,) + transformer_outputs[2:]
973
- return ((loss,) + output) if loss is not None else output
974
-
975
- return TokenClassifierOutput(
976
- loss=loss,
977
- logits=logits,
978
- hidden_states=transformer_outputs.hidden_states,
979
- attentions=transformer_outputs.attentions,
980
- )
981
-
982
-
983
- class RWForQuestionAnswering(RWPreTrainedModel):
984
- _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
985
-
986
- def __init__(self, config):
987
- super().__init__(config)
988
- self.transformer = RWModel(config)
989
- self.qa_outputs = nn.Linear(config.hidden_size, 2)
990
-
991
- # Initialize weights and apply final processing
992
- self.post_init()
993
-
994
- def forward(
995
- self,
996
- input_ids: Optional[torch.LongTensor] = None,
997
- attention_mask: Optional[torch.FloatTensor] = None,
998
- position_ids: Optional[torch.LongTensor] = None,
999
- head_mask: Optional[torch.FloatTensor] = None,
1000
- inputs_embeds: Optional[torch.FloatTensor] = None,
1001
- start_positions: Optional[torch.LongTensor] = None,
1002
- end_positions: Optional[torch.LongTensor] = None,
1003
- output_attentions: Optional[bool] = None,
1004
- output_hidden_states: Optional[bool] = None,
1005
- return_dict: Optional[bool] = None,
1006
- ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1007
- r"""
1008
- start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1009
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
1010
- Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1011
- are not taken into account for computing the loss.
1012
- end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1013
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
1014
- Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1015
- are not taken into account for computing the loss.
1016
- """
1017
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1018
-
1019
- outputs = self.transformer(
1020
- input_ids,
1021
- attention_mask=attention_mask,
1022
- position_ids=position_ids,
1023
- head_mask=head_mask,
1024
- inputs_embeds=inputs_embeds,
1025
- output_attentions=output_attentions,
1026
- output_hidden_states=output_hidden_states,
1027
- return_dict=return_dict,
1028
- )
1029
-
1030
- sequence_output = outputs[0]
1031
-
1032
- logits = self.qa_outputs(sequence_output)
1033
- start_logits, end_logits = logits.split(1, dim=-1)
1034
- start_logits = start_logits.squeeze(-1).contiguous()
1035
- end_logits = end_logits.squeeze(-1).contiguous()
1036
-
1037
- total_loss = None
1038
- if start_positions is not None and end_positions is not None:
1039
- # If we are on multi-GPU, split add a dimension
1040
- if len(start_positions.size()) > 1:
1041
- start_positions = start_positions.squeeze(-1)
1042
- if len(end_positions.size()) > 1:
1043
- end_positions = end_positions.squeeze(-1)
1044
- # sometimes the start/end positions are outside our model inputs, we ignore these terms
1045
- ignored_index = start_logits.size(1)
1046
- start_positions = start_positions.clamp(0, ignored_index)
1047
- end_positions = end_positions.clamp(0, ignored_index)
1048
-
1049
- loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1050
- start_loss = loss_fct(start_logits, start_positions)
1051
- end_loss = loss_fct(end_logits, end_positions)
1052
- total_loss = (start_loss + end_loss) / 2
1053
-
1054
- if not return_dict:
1055
- output = (start_logits, end_logits) + outputs[2:]
1056
- return ((total_loss,) + output) if total_loss is not None else output
1057
-
1058
- return QuestionAnsweringModelOutput(
1059
- loss=total_loss,
1060
- start_logits=start_logits,
1061
- end_logits=end_logits,
1062
- hidden_states=outputs.hidden_states,
1063
- attentions=outputs.attentions,
1064
- )
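A note on the classification head earlier in this file: `RWForSequenceClassification` pools by taking the logits at the last non-padding position of each sequence. A minimal sketch of that pooling rule, using toy shapes and an assumed `pad_token_id` of 0:

```python
import torch

# Toy setup: batch of 2, sequence length 5, 3 labels; pad_token_id assumed to be 0.
pad_token_id = 0
input_ids = torch.tensor([[5, 9, 4, 0, 0],
                          [7, 2, 8, 3, 6]])
logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)

# Index of the last real token = (number of non-pad tokens) - 1,
# exactly as computed in RWForSequenceClassification.forward above.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(dim=-1) - 1  # tensor([2, 4])
pooled_logits = logits[torch.arange(2), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])
```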
mllm/flamingo/flamingo-falcon-7B.json DELETED
@@ -1,112 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoModel"
5
- ],
6
- "cross_attn_every_n_layers": 4,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "architectures": [
10
- "RWForCausalLM"
11
- ],
12
- "apply_residual_connection_post_layernorm": false,
13
- "attention_dropout": 0.0,
14
- "bias": false,
15
- "bos_token_id": 11,
16
- "eos_token_id": 11,
17
- "hidden_dropout": 0.0,
18
- "hidden_size": 4544,
19
- "initializer_range": 0.02,
20
- "layer_norm_epsilon": 1e-05,
21
- "model_type": "RefinedWebModel",
22
- "multi_query": true,
23
- "n_head": 71,
24
- "n_layer": 32,
25
- "parallel_attn": true,
26
- "torch_dtype": "bfloat16",
27
- "transformers_version": "4.27.4",
28
- "use_cache": true,
29
- "vocab_size": 65024
30
- },
31
- "tie_word_embeddings": false,
32
- "torch_dtype": "float32",
33
- "transformers_version": null,
34
- "use_media_placement_augmentation": true,
35
- "vision_config": {
36
- "_name_or_path": "openai/clip-vit-large-patch14",
37
- "add_cross_attention": false,
38
- "architectures": null,
39
- "attention_dropout": 0.0,
40
- "bad_words_ids": null,
41
- "begin_suppress_tokens": null,
42
- "bos_token_id": null,
43
- "chunk_size_feed_forward": 0,
44
- "cross_attention_hidden_size": null,
45
- "decoder_start_token_id": null,
46
- "diversity_penalty": 0.0,
47
- "do_sample": false,
48
- "early_stopping": false,
49
- "encoder_no_repeat_ngram_size": 0,
50
- "eos_token_id": null,
51
- "exponential_decay_length_penalty": null,
52
- "finetuning_task": null,
53
- "forced_bos_token_id": null,
54
- "forced_eos_token_id": null,
55
- "hidden_act": "quick_gelu",
56
- "hidden_size": 1024,
57
- "id2label": {
58
- "0": "LABEL_0",
59
- "1": "LABEL_1"
60
- },
61
- "image_size": 224,
62
- "initializer_factor": 1.0,
63
- "initializer_range": 0.02,
64
- "intermediate_size": 4096,
65
- "is_decoder": false,
66
- "is_encoder_decoder": false,
67
- "label2id": {
68
- "LABEL_0": 0,
69
- "LABEL_1": 1
70
- },
71
- "layer_norm_eps": 1e-05,
72
- "length_penalty": 1.0,
73
- "max_length": 20,
74
- "min_length": 0,
75
- "model_type": "clip_vision_model",
76
- "no_repeat_ngram_size": 0,
77
- "num_attention_heads": 16,
78
- "num_beam_groups": 1,
79
- "num_beams": 1,
80
- "num_channels": 3,
81
- "num_hidden_layers": 24,
82
- "num_return_sequences": 1,
83
- "output_attentions": false,
84
- "output_hidden_states": false,
85
- "output_scores": false,
86
- "pad_token_id": null,
87
- "patch_size": 14,
88
- "prefix": null,
89
- "problem_type": null,
90
- "projection_dim": 512,
91
- "pruned_heads": {},
92
- "remove_invalid_values": false,
93
- "repetition_penalty": 1.0,
94
- "return_dict": true,
95
- "return_dict_in_generate": false,
96
- "sep_token_id": null,
97
- "suppress_tokens": null,
98
- "task_specific_params": null,
99
- "temperature": 1.0,
100
- "tf_legacy_loss": false,
101
- "tie_encoder_decoder": false,
102
- "tie_word_embeddings": true,
103
- "tokenizer_class": null,
104
- "top_k": 50,
105
- "top_p": 1.0,
106
- "torch_dtype": null,
107
- "torchscript": false,
108
- "transformers_version": "4.28.1",
109
- "typical_p": 1.0,
110
- "use_bfloat16": false
111
- }
112
- }
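For reference, config JSONs like the one above are what `FlamingoConfig.from_json_file` consumes in the injection scripts later in this diff. A quick way to inspect one directly, assuming the file is still on disk at its original path:

```python
import json

with open("mllm/flamingo/flamingo-falcon-7B.json") as f:
    cfg = json.load(f)

print(cfg["cross_attn_every_n_layers"])       # 4
print(cfg["text_config"]["hidden_size"])      # 4544 (Falcon-7B)
print(cfg["vision_config"]["_name_or_path"])  # openai/clip-vit-large-patch14
```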
mllm/flamingo/flamingo-llama2-chat-13B.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 8,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
10
- "architectures": [
11
- "LlamaForCausalLM"
12
- ],
13
- "bos_token_id": 1,
14
- "eos_token_id": 2,
15
- "hidden_act": "silu",
16
- "hidden_size": 5120,
17
- "initializer_range": 0.02,
18
- "intermediate_size": 13824,
19
- "max_position_embeddings": 4096,
20
- "model_type": "llama",
21
- "num_attention_heads": 40,
22
- "num_hidden_layers": 40,
23
- "num_key_value_heads": 40,
24
- "pad_token_id": 0,
25
- "pretraining_tp": 1,
26
- "rms_norm_eps": 1e-05,
27
- "rope_scaling": null,
28
- "tie_word_embeddings": false,
29
- "torch_dtype": "float16",
30
- "transformers_version": "4.30.1",
31
- "use_cache": true,
32
- "vocab_size": 32000
33
- },
34
- "torch_dtype": "float32",
35
- "transformers_version": null,
36
- "use_media_placement_augmentation": true,
37
- "vision_config": {
38
- "_name_or_path": "openai/clip-vit-large-patch14",
39
- "add_cross_attention": false,
40
- "architectures": null,
41
- "attention_dropout": 0.0,
42
- "bad_words_ids": null,
43
- "begin_suppress_tokens": null,
44
- "bos_token_id": null,
45
- "chunk_size_feed_forward": 0,
46
- "cross_attention_hidden_size": null,
47
- "decoder_start_token_id": null,
48
- "diversity_penalty": 0.0,
49
- "do_sample": false,
50
- "early_stopping": false,
51
- "encoder_no_repeat_ngram_size": 0,
52
- "eos_token_id": null,
53
- "exponential_decay_length_penalty": null,
54
- "finetuning_task": null,
55
- "forced_bos_token_id": null,
56
- "forced_eos_token_id": null,
57
- "hidden_act": "quick_gelu",
58
- "hidden_size": 1024,
59
- "id2label": {
60
- "0": "LABEL_0",
61
- "1": "LABEL_1"
62
- },
63
- "image_size": 224,
64
- "initializer_factor": 1.0,
65
- "initializer_range": 0.02,
66
- "intermediate_size": 4096,
67
- "is_decoder": false,
68
- "is_encoder_decoder": false,
69
- "label2id": {
70
- "LABEL_0": 0,
71
- "LABEL_1": 1
72
- },
73
- "layer_norm_eps": 1e-05,
74
- "length_penalty": 1.0,
75
- "max_length": 20,
76
- "min_length": 0,
77
- "model_type": "clip_vision_model",
78
- "no_repeat_ngram_size": 0,
79
- "num_attention_heads": 16,
80
- "num_beam_groups": 1,
81
- "num_beams": 1,
82
- "num_channels": 3,
83
- "num_hidden_layers": 24,
84
- "num_return_sequences": 1,
85
- "output_attentions": false,
86
- "output_hidden_states": false,
87
- "output_scores": false,
88
- "pad_token_id": null,
89
- "patch_size": 14,
90
- "prefix": null,
91
- "problem_type": null,
92
- "projection_dim": 512,
93
- "pruned_heads": {},
94
- "remove_invalid_values": false,
95
- "repetition_penalty": 1.0,
96
- "return_dict": true,
97
- "return_dict_in_generate": false,
98
- "sep_token_id": null,
99
- "suppress_tokens": null,
100
- "task_specific_params": null,
101
- "temperature": 1.0,
102
- "tf_legacy_loss": false,
103
- "tie_encoder_decoder": false,
104
- "tie_word_embeddings": true,
105
- "tokenizer_class": null,
106
- "top_k": 50,
107
- "top_p": 1.0,
108
- "torch_dtype": null,
109
- "torchscript": false,
110
- "transformers_version": "4.30.1",
111
- "typical_p": 1.0,
112
- "use_bfloat16": false
113
- }
114
- }
mllm/flamingo/flamingo-llama2-chat-7B.json DELETED
@@ -1,115 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 4,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
10
- "architectures": [
11
- "LlamaForCausalLM"
12
- ],
13
- "bos_token_id": 1,
14
- "eos_token_id": 2,
15
- "hidden_act": "silu",
16
- "hidden_size": 4096,
17
- "initializer_range": 0.02,
18
- "intermediate_size": 11008,
19
- "max_length": 4096,
20
- "max_position_embeddings": 2048,
21
- "model_type": "llama",
22
- "num_attention_heads": 32,
23
- "num_hidden_layers": 32,
24
- "num_key_value_heads": 32,
25
- "pad_token_id": 0,
26
- "pretraining_tp": 1,
27
- "rms_norm_eps": 1e-05,
28
- "rope_scaling": null,
29
- "tie_word_embeddings": false,
30
- "torch_dtype": "float16",
31
- "transformers_version": "4.32.0.dev0",
32
- "use_cache": true,
33
- "vocab_size": 32000
34
- },
35
- "torch_dtype": "float32",
36
- "transformers_version": null,
37
- "use_media_placement_augmentation": true,
38
- "vision_config": {
39
- "_name_or_path": "openai/clip-vit-large-patch14",
40
- "add_cross_attention": false,
41
- "architectures": null,
42
- "attention_dropout": 0.0,
43
- "bad_words_ids": null,
44
- "begin_suppress_tokens": null,
45
- "bos_token_id": null,
46
- "chunk_size_feed_forward": 0,
47
- "cross_attention_hidden_size": null,
48
- "decoder_start_token_id": null,
49
- "diversity_penalty": 0.0,
50
- "do_sample": false,
51
- "early_stopping": false,
52
- "encoder_no_repeat_ngram_size": 0,
53
- "eos_token_id": null,
54
- "exponential_decay_length_penalty": null,
55
- "finetuning_task": null,
56
- "forced_bos_token_id": null,
57
- "forced_eos_token_id": null,
58
- "hidden_act": "quick_gelu",
59
- "hidden_size": 1024,
60
- "id2label": {
61
- "0": "LABEL_0",
62
- "1": "LABEL_1"
63
- },
64
- "image_size": 224,
65
- "initializer_factor": 1.0,
66
- "initializer_range": 0.02,
67
- "intermediate_size": 4096,
68
- "is_decoder": false,
69
- "is_encoder_decoder": false,
70
- "label2id": {
71
- "LABEL_0": 0,
72
- "LABEL_1": 1
73
- },
74
- "layer_norm_eps": 1e-05,
75
- "length_penalty": 1.0,
76
- "max_length": 20,
77
- "min_length": 0,
78
- "model_type": "clip_vision_model",
79
- "no_repeat_ngram_size": 0,
80
- "num_attention_heads": 16,
81
- "num_beam_groups": 1,
82
- "num_beams": 1,
83
- "num_channels": 3,
84
- "num_hidden_layers": 24,
85
- "num_return_sequences": 1,
86
- "output_attentions": false,
87
- "output_hidden_states": false,
88
- "output_scores": false,
89
- "pad_token_id": null,
90
- "patch_size": 14,
91
- "prefix": null,
92
- "problem_type": null,
93
- "projection_dim": 512,
94
- "pruned_heads": {},
95
- "remove_invalid_values": false,
96
- "repetition_penalty": 1.0,
97
- "return_dict": true,
98
- "return_dict_in_generate": false,
99
- "sep_token_id": null,
100
- "suppress_tokens": null,
101
- "task_specific_params": null,
102
- "temperature": 1.0,
103
- "tf_legacy_loss": false,
104
- "tie_encoder_decoder": false,
105
- "tie_word_embeddings": true,
106
- "tokenizer_class": null,
107
- "top_k": 50,
108
- "top_p": 1.0,
109
- "torch_dtype": null,
110
- "torchscript": false,
111
- "transformers_version": "4.30.1",
112
- "typical_p": 1.0,
113
- "use_bfloat16": false
114
- }
115
- }
mllm/flamingo/flamingo-mpt-1B-redpajama.json DELETED
@@ -1,131 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 1,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "",
10
- "alibi": true,
11
- "alibi_bias_max": 8,
12
- "architectures": [
13
- "MosaicGPT"
14
- ],
15
- "attn_clip_qkv": null,
16
- "attn_impl": "torch",
17
- "attn_pdrop": 0,
18
- "attn_qk_ln": true,
19
- "attn_uses_sequence_id": false,
20
- "d_model": 2048,
21
- "hidden_size": 2048,
22
- "emb_init_std": null,
23
- "emb_init_uniform_lim": null,
24
- "emb_pdrop": 0,
25
- "embedding_fraction": 1.0,
26
- "fan_mode": "fan_in",
27
- "init_device": "cpu",
28
- "init_div_is_residual": true,
29
- "init_gain": 0,
30
- "init_nonlinearity": "relu",
31
- "init_std": 0.02,
32
- "logit_scale": null,
33
- "low_precision_layernorm": true,
34
- "max_seq_len": 2048,
35
- "mlp_ratio": 4,
36
- "model_type": "mosaic_gpt",
37
- "n_heads": 16,
38
- "n_layers": 24,
39
- "no_bias": true,
40
- "param_init_fn": "kaiming_normal_",
41
- "prefix_lm": false,
42
- "resid_pdrop": 0,
43
- "softmax_scale": null,
44
- "tokenizer_name": "EleutherAI/gpt-neox-20b",
45
- "torch_dtype": "float32",
46
- "transformers_version": "4.27.4",
47
- "use_cache": false,
48
- "verbose": 0,
49
- "vocab_size": 50432
50
- },
51
- "torch_dtype": "float32",
52
- "transformers_version": null,
53
- "use_media_placement_augmentation": true,
54
- "vision_config": {
55
- "_name_or_path": "openai/clip-vit-large-patch14",
56
- "add_cross_attention": false,
57
- "architectures": null,
58
- "attention_dropout": 0.0,
59
- "bad_words_ids": null,
60
- "begin_suppress_tokens": null,
61
- "bos_token_id": null,
62
- "chunk_size_feed_forward": 0,
63
- "cross_attention_hidden_size": null,
64
- "decoder_start_token_id": null,
65
- "diversity_penalty": 0.0,
66
- "do_sample": false,
67
- "early_stopping": false,
68
- "encoder_no_repeat_ngram_size": 0,
69
- "eos_token_id": null,
70
- "exponential_decay_length_penalty": null,
71
- "finetuning_task": null,
72
- "forced_bos_token_id": null,
73
- "forced_eos_token_id": null,
74
- "hidden_act": "quick_gelu",
75
- "hidden_size": 1024,
76
- "id2label": {
77
- "0": "LABEL_0",
78
- "1": "LABEL_1"
79
- },
80
- "image_size": 224,
81
- "initializer_factor": 1.0,
82
- "initializer_range": 0.02,
83
- "intermediate_size": 4096,
84
- "is_decoder": false,
85
- "is_encoder_decoder": false,
86
- "label2id": {
87
- "LABEL_0": 0,
88
- "LABEL_1": 1
89
- },
90
- "layer_norm_eps": 1e-05,
91
- "length_penalty": 1.0,
92
- "max_length": 20,
93
- "min_length": 0,
94
- "model_type": "clip_vision_model",
95
- "no_repeat_ngram_size": 0,
96
- "num_attention_heads": 16,
97
- "num_beam_groups": 1,
98
- "num_beams": 1,
99
- "num_channels": 3,
100
- "num_hidden_layers": 24,
101
- "num_return_sequences": 1,
102
- "output_attentions": false,
103
- "output_hidden_states": false,
104
- "output_scores": false,
105
- "pad_token_id": null,
106
- "patch_size": 14,
107
- "prefix": null,
108
- "problem_type": null,
109
- "projection_dim": 512,
110
- "pruned_heads": {},
111
- "remove_invalid_values": false,
112
- "repetition_penalty": 1.0,
113
- "return_dict": true,
114
- "return_dict_in_generate": false,
115
- "sep_token_id": null,
116
- "suppress_tokens": null,
117
- "task_specific_params": null,
118
- "temperature": 1.0,
119
- "tf_legacy_loss": false,
120
- "tie_encoder_decoder": false,
121
- "tie_word_embeddings": true,
122
- "tokenizer_class": null,
123
- "top_k": 50,
124
- "top_p": 1.0,
125
- "torch_dtype": null,
126
- "torchscript": false,
127
- "transformers_version": "4.30.1",
128
- "typical_p": 1.0,
129
- "use_bfloat16": false
130
- }
131
- }
mllm/flamingo/flamingo-mpt-30B-bf16.json DELETED
@@ -1,195 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 7,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "",
10
- "add_cross_attention": false,
11
- "architectures": [
12
- "MPTForCausalLM"
13
- ],
14
- "attn_config": {
15
- "alibi": true,
16
- "alibi_bias_max": 8,
17
- "attn_impl": "torch",
18
- "attn_pdrop": 0,
19
- "attn_type": "multihead_attention",
20
- "attn_uses_sequence_id": false,
21
- "clip_qkv": null,
22
- "prefix_lm": false,
23
- "qk_ln": false,
24
- "softmax_scale": null
25
- },
26
- "bad_words_ids": null,
27
- "begin_suppress_tokens": null,
28
- "bos_token_id": null,
29
- "chunk_size_feed_forward": 0,
30
- "cross_attention_hidden_size": null,
31
- "d_model": 7168,
32
- "decoder_start_token_id": null,
33
- "diversity_penalty": 0.0,
34
- "do_sample": false,
35
- "early_stopping": false,
36
- "emb_pdrop": 0,
37
- "embedding_fraction": 1.0,
38
- "encoder_no_repeat_ngram_size": 0,
39
- "eos_token_id": null,
40
- "expansion_ratio": 4,
41
- "exponential_decay_length_penalty": null,
42
- "finetuning_task": null,
43
- "forced_bos_token_id": null,
44
- "forced_eos_token_id": null,
45
- "hidden_size": 7168,
46
- "id2label": {
47
- "0": "LABEL_0",
48
- "1": "LABEL_1"
49
- },
50
- "init_config": {
51
- "emb_init_std": null,
52
- "emb_init_uniform_lim": null,
53
- "fan_mode": "fan_in",
54
- "init_div_is_residual": true,
55
- "init_gain": 0.0,
56
- "init_nonlinearity": "relu",
57
- "init_std": null,
58
- "name": "kaiming_normal_",
59
- "verbose": 0
60
- },
61
- "init_device": "cpu",
62
- "is_decoder": false,
63
- "is_encoder_decoder": false,
64
- "label2id": {
65
- "LABEL_0": 0,
66
- "LABEL_1": 1
67
- },
68
- "learned_pos_emb": true,
69
- "length_penalty": 1.0,
70
- "logit_scale": null,
71
- "max_length": 20,
72
- "max_seq_len": 8192,
73
- "min_length": 0,
74
- "model_type": "mpt",
75
- "n_heads": 64,
76
- "n_layers": 48,
77
- "no_bias": true,
78
- "no_repeat_ngram_size": 0,
79
- "norm_type": "low_precision_layernorm",
80
- "num_beam_groups": 1,
81
- "num_beams": 1,
82
- "num_return_sequences": 1,
83
- "output_attentions": false,
84
- "output_hidden_states": false,
85
- "output_scores": false,
86
- "pad_token_id": null,
87
- "prefix": null,
88
- "problem_type": null,
89
- "pruned_heads": {},
90
- "remove_invalid_values": false,
91
- "repetition_penalty": 1.0,
92
- "resid_pdrop": 0,
93
- "return_dict": true,
94
- "return_dict_in_generate": false,
95
- "sep_token_id": null,
96
- "suppress_tokens": null,
97
- "task_specific_params": null,
98
- "temperature": 1.0,
99
- "tf_legacy_loss": false,
100
- "tie_encoder_decoder": false,
101
- "tie_word_embeddings": true,
102
- "tokenizer_class": null,
103
- "tokenizer_name": "EleutherAI/gpt-neox-20b",
104
- "top_k": 50,
105
- "top_p": 1.0,
106
- "torch_dtype": "bfloat16",
107
- "torchscript": false,
108
- "transformers_version": "4.30.1",
109
- "typical_p": 1.0,
110
- "use_bfloat16": false,
111
- "use_cache": false,
112
- "verbose": 0,
113
- "vocab_size": 50432
114
- },
115
- "torch_dtype": "bfloat16",
116
- "transformers_version": null,
117
- "use_media_placement_augmentation": true,
118
- "vision_config": {
119
- "_name_or_path": "openai/clip-vit-large-patch14",
120
- "add_cross_attention": false,
121
- "architectures": null,
122
- "attention_dropout": 0.0,
123
- "bad_words_ids": null,
124
- "begin_suppress_tokens": null,
125
- "bos_token_id": null,
126
- "chunk_size_feed_forward": 0,
127
- "cross_attention_hidden_size": null,
128
- "decoder_start_token_id": null,
129
- "diversity_penalty": 0.0,
130
- "do_sample": false,
131
- "early_stopping": false,
132
- "encoder_no_repeat_ngram_size": 0,
133
- "eos_token_id": null,
134
- "exponential_decay_length_penalty": null,
135
- "finetuning_task": null,
136
- "forced_bos_token_id": null,
137
- "forced_eos_token_id": null,
138
- "hidden_act": "quick_gelu",
139
- "hidden_size": 1024,
140
- "id2label": {
141
- "0": "LABEL_0",
142
- "1": "LABEL_1"
143
- },
144
- "image_size": 224,
145
- "initializer_factor": 1.0,
146
- "initializer_range": 0.02,
147
- "intermediate_size": 4096,
148
- "is_decoder": false,
149
- "is_encoder_decoder": false,
150
- "label2id": {
151
- "LABEL_0": 0,
152
- "LABEL_1": 1
153
- },
154
- "layer_norm_eps": 1e-05,
155
- "length_penalty": 1.0,
156
- "max_length": 20,
157
- "min_length": 0,
158
- "model_type": "clip_vision_model",
159
- "no_repeat_ngram_size": 0,
160
- "num_attention_heads": 16,
161
- "num_beam_groups": 1,
162
- "num_beams": 1,
163
- "num_channels": 3,
164
- "num_hidden_layers": 24,
165
- "num_return_sequences": 1,
166
- "output_attentions": false,
167
- "output_hidden_states": false,
168
- "output_scores": false,
169
- "pad_token_id": null,
170
- "patch_size": 14,
171
- "prefix": null,
172
- "problem_type": null,
173
- "projection_dim": 512,
174
- "pruned_heads": {},
175
- "remove_invalid_values": false,
176
- "repetition_penalty": 1.0,
177
- "return_dict": true,
178
- "return_dict_in_generate": false,
179
- "sep_token_id": null,
180
- "suppress_tokens": null,
181
- "task_specific_params": null,
182
- "temperature": 1.0,
183
- "tf_legacy_loss": false,
184
- "tie_encoder_decoder": false,
185
- "tie_word_embeddings": true,
186
- "tokenizer_class": null,
187
- "top_k": 50,
188
- "top_p": 1.0,
189
- "torch_dtype": null,
190
- "torchscript": false,
191
- "transformers_version": "4.30.1",
192
- "typical_p": 1.0,
193
- "use_bfloat16": false
194
- }
195
- }
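This bf16 variant and the flamingo-mpt-30B.json that follows appear to differ only in the top-level "torch_dtype" ("bfloat16" vs. "float32"). A quick check, assuming both files are on disk:

```python
import json

with open("mllm/flamingo/flamingo-mpt-30B-bf16.json") as f:
    bf16 = json.load(f)
with open("mllm/flamingo/flamingo-mpt-30B.json") as f:
    fp32 = json.load(f)

# Dict equality recurses into the nested text/vision configs, so only
# genuinely differing keys show up here.
diff = {k: (bf16[k], fp32[k]) for k in bf16 if bf16[k] != fp32[k]}
print(diff)  # expected: {'torch_dtype': ('bfloat16', 'float32')}
```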
mllm/flamingo/flamingo-mpt-30B.json DELETED
@@ -1,195 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 7,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "",
10
- "add_cross_attention": false,
11
- "architectures": [
12
- "MPTForCausalLM"
13
- ],
14
- "attn_config": {
15
- "alibi": true,
16
- "alibi_bias_max": 8,
17
- "attn_impl": "torch",
18
- "attn_pdrop": 0,
19
- "attn_type": "multihead_attention",
20
- "attn_uses_sequence_id": false,
21
- "clip_qkv": null,
22
- "prefix_lm": false,
23
- "qk_ln": false,
24
- "softmax_scale": null
25
- },
26
- "bad_words_ids": null,
27
- "begin_suppress_tokens": null,
28
- "bos_token_id": null,
29
- "chunk_size_feed_forward": 0,
30
- "cross_attention_hidden_size": null,
31
- "d_model": 7168,
32
- "decoder_start_token_id": null,
33
- "diversity_penalty": 0.0,
34
- "do_sample": false,
35
- "early_stopping": false,
36
- "emb_pdrop": 0,
37
- "embedding_fraction": 1.0,
38
- "encoder_no_repeat_ngram_size": 0,
39
- "eos_token_id": null,
40
- "expansion_ratio": 4,
41
- "exponential_decay_length_penalty": null,
42
- "finetuning_task": null,
43
- "forced_bos_token_id": null,
44
- "forced_eos_token_id": null,
45
- "hidden_size": 7168,
46
- "id2label": {
47
- "0": "LABEL_0",
48
- "1": "LABEL_1"
49
- },
50
- "init_config": {
51
- "emb_init_std": null,
52
- "emb_init_uniform_lim": null,
53
- "fan_mode": "fan_in",
54
- "init_div_is_residual": true,
55
- "init_gain": 0.0,
56
- "init_nonlinearity": "relu",
57
- "init_std": null,
58
- "name": "kaiming_normal_",
59
- "verbose": 0
60
- },
61
- "init_device": "cpu",
62
- "is_decoder": false,
63
- "is_encoder_decoder": false,
64
- "label2id": {
65
- "LABEL_0": 0,
66
- "LABEL_1": 1
67
- },
68
- "learned_pos_emb": true,
69
- "length_penalty": 1.0,
70
- "logit_scale": null,
71
- "max_length": 20,
72
- "max_seq_len": 8192,
73
- "min_length": 0,
74
- "model_type": "mpt",
75
- "n_heads": 64,
76
- "n_layers": 48,
77
- "no_bias": true,
78
- "no_repeat_ngram_size": 0,
79
- "norm_type": "low_precision_layernorm",
80
- "num_beam_groups": 1,
81
- "num_beams": 1,
82
- "num_return_sequences": 1,
83
- "output_attentions": false,
84
- "output_hidden_states": false,
85
- "output_scores": false,
86
- "pad_token_id": null,
87
- "prefix": null,
88
- "problem_type": null,
89
- "pruned_heads": {},
90
- "remove_invalid_values": false,
91
- "repetition_penalty": 1.0,
92
- "resid_pdrop": 0,
93
- "return_dict": true,
94
- "return_dict_in_generate": false,
95
- "sep_token_id": null,
96
- "suppress_tokens": null,
97
- "task_specific_params": null,
98
- "temperature": 1.0,
99
- "tf_legacy_loss": false,
100
- "tie_encoder_decoder": false,
101
- "tie_word_embeddings": true,
102
- "tokenizer_class": null,
103
- "tokenizer_name": "EleutherAI/gpt-neox-20b",
104
- "top_k": 50,
105
- "top_p": 1.0,
106
- "torch_dtype": "bfloat16",
107
- "torchscript": false,
108
- "transformers_version": "4.30.1",
109
- "typical_p": 1.0,
110
- "use_bfloat16": false,
111
- "use_cache": false,
112
- "verbose": 0,
113
- "vocab_size": 50432
114
- },
115
- "torch_dtype": "float32",
116
- "transformers_version": null,
117
- "use_media_placement_augmentation": true,
118
- "vision_config": {
119
- "_name_or_path": "openai/clip-vit-large-patch14",
120
- "add_cross_attention": false,
121
- "architectures": null,
122
- "attention_dropout": 0.0,
123
- "bad_words_ids": null,
124
- "begin_suppress_tokens": null,
125
- "bos_token_id": null,
126
- "chunk_size_feed_forward": 0,
127
- "cross_attention_hidden_size": null,
128
- "decoder_start_token_id": null,
129
- "diversity_penalty": 0.0,
130
- "do_sample": false,
131
- "early_stopping": false,
132
- "encoder_no_repeat_ngram_size": 0,
133
- "eos_token_id": null,
134
- "exponential_decay_length_penalty": null,
135
- "finetuning_task": null,
136
- "forced_bos_token_id": null,
137
- "forced_eos_token_id": null,
138
- "hidden_act": "quick_gelu",
139
- "hidden_size": 1024,
140
- "id2label": {
141
- "0": "LABEL_0",
142
- "1": "LABEL_1"
143
- },
144
- "image_size": 224,
145
- "initializer_factor": 1.0,
146
- "initializer_range": 0.02,
147
- "intermediate_size": 4096,
148
- "is_decoder": false,
149
- "is_encoder_decoder": false,
150
- "label2id": {
151
- "LABEL_0": 0,
152
- "LABEL_1": 1
153
- },
154
- "layer_norm_eps": 1e-05,
155
- "length_penalty": 1.0,
156
- "max_length": 20,
157
- "min_length": 0,
158
- "model_type": "clip_vision_model",
159
- "no_repeat_ngram_size": 0,
160
- "num_attention_heads": 16,
161
- "num_beam_groups": 1,
162
- "num_beams": 1,
163
- "num_channels": 3,
164
- "num_hidden_layers": 24,
165
- "num_return_sequences": 1,
166
- "output_attentions": false,
167
- "output_hidden_states": false,
168
- "output_scores": false,
169
- "pad_token_id": null,
170
- "patch_size": 14,
171
- "prefix": null,
172
- "problem_type": null,
173
- "projection_dim": 512,
174
- "pruned_heads": {},
175
- "remove_invalid_values": false,
176
- "repetition_penalty": 1.0,
177
- "return_dict": true,
178
- "return_dict_in_generate": false,
179
- "sep_token_id": null,
180
- "suppress_tokens": null,
181
- "task_specific_params": null,
182
- "temperature": 1.0,
183
- "tf_legacy_loss": false,
184
- "tie_encoder_decoder": false,
185
- "tie_word_embeddings": true,
186
- "tokenizer_class": null,
187
- "top_k": 50,
188
- "top_p": 1.0,
189
- "torch_dtype": null,
190
- "torchscript": false,
191
- "transformers_version": "4.30.1",
192
- "typical_p": 1.0,
193
- "use_bfloat16": false
194
- }
195
- }
mllm/flamingo/flamingo-mpt-7B.json DELETED
@@ -1,195 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 4,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "",
10
- "add_cross_attention": false,
11
- "architectures": [
12
- "MPTForCausalLM"
13
- ],
14
- "attn_config": {
15
- "alibi": true,
16
- "alibi_bias_max": 8,
17
- "attn_impl": "torch",
18
- "attn_pdrop": 0,
19
- "attn_type": "multihead_attention",
20
- "attn_uses_sequence_id": false,
21
- "clip_qkv": null,
22
- "prefix_lm": false,
23
- "qk_ln": false,
24
- "softmax_scale": null
25
- },
26
- "bad_words_ids": null,
27
- "begin_suppress_tokens": null,
28
- "bos_token_id": null,
29
- "chunk_size_feed_forward": 0,
30
- "cross_attention_hidden_size": null,
31
- "d_model": 4096,
32
- "decoder_start_token_id": null,
33
- "diversity_penalty": 0.0,
34
- "do_sample": false,
35
- "early_stopping": false,
36
- "emb_pdrop": 0,
37
- "embedding_fraction": 1.0,
38
- "encoder_no_repeat_ngram_size": 0,
39
- "eos_token_id": null,
40
- "expansion_ratio": 4,
41
- "exponential_decay_length_penalty": null,
42
- "finetuning_task": null,
43
- "forced_bos_token_id": null,
44
- "forced_eos_token_id": null,
45
- "hidden_size": 4096,
46
- "id2label": {
47
- "0": "LABEL_0",
48
- "1": "LABEL_1"
49
- },
50
- "init_config": {
51
- "emb_init_std": null,
52
- "emb_init_uniform_lim": null,
53
- "fan_mode": "fan_in",
54
- "init_div_is_residual": true,
55
- "init_gain": 0,
56
- "init_nonlinearity": "relu",
57
- "init_std": 0.02,
58
- "name": "kaiming_normal_",
59
- "verbose": 0
60
- },
61
- "init_device": "cpu",
62
- "is_decoder": false,
63
- "is_encoder_decoder": false,
64
- "label2id": {
65
- "LABEL_0": 0,
66
- "LABEL_1": 1
67
- },
68
- "learned_pos_emb": true,
69
- "length_penalty": 1.0,
70
- "logit_scale": null,
71
- "max_length": 20,
72
- "max_seq_len": 2048,
73
- "min_length": 0,
74
- "model_type": "mpt",
75
- "n_heads": 32,
76
- "n_layers": 32,
77
- "no_bias": true,
78
- "no_repeat_ngram_size": 0,
79
- "norm_type": "low_precision_layernorm",
80
- "num_beam_groups": 1,
81
- "num_beams": 1,
82
- "num_return_sequences": 1,
83
- "output_attentions": false,
84
- "output_hidden_states": false,
85
- "output_scores": false,
86
- "pad_token_id": null,
87
- "prefix": null,
88
- "problem_type": null,
89
- "pruned_heads": {},
90
- "remove_invalid_values": false,
91
- "repetition_penalty": 1.0,
92
- "resid_pdrop": 0,
93
- "return_dict": true,
94
- "return_dict_in_generate": false,
95
- "sep_token_id": null,
96
- "suppress_tokens": null,
97
- "task_specific_params": null,
98
- "temperature": 1.0,
99
- "tf_legacy_loss": false,
100
- "tie_encoder_decoder": false,
101
- "tie_word_embeddings": true,
102
- "tokenizer_class": null,
103
- "tokenizer_name": "EleutherAI/gpt-neox-20b",
104
- "top_k": 50,
105
- "top_p": 1.0,
106
- "torch_dtype": "bfloat16",
107
- "torchscript": false,
108
- "transformers_version": "4.30.1",
109
- "typical_p": 1.0,
110
- "use_bfloat16": false,
111
- "use_cache": false,
112
- "verbose": 0,
113
- "vocab_size": 50432
114
- },
115
- "torch_dtype": "float32",
116
- "transformers_version": null,
117
- "use_media_placement_augmentation": true,
118
- "vision_config": {
119
- "_name_or_path": "openai/clip-vit-large-patch14",
120
- "add_cross_attention": false,
121
- "architectures": null,
122
- "attention_dropout": 0.0,
123
- "bad_words_ids": null,
124
- "begin_suppress_tokens": null,
125
- "bos_token_id": null,
126
- "chunk_size_feed_forward": 0,
127
- "cross_attention_hidden_size": null,
128
- "decoder_start_token_id": null,
129
- "diversity_penalty": 0.0,
130
- "do_sample": false,
131
- "early_stopping": false,
132
- "encoder_no_repeat_ngram_size": 0,
133
- "eos_token_id": null,
134
- "exponential_decay_length_penalty": null,
135
- "finetuning_task": null,
136
- "forced_bos_token_id": null,
137
- "forced_eos_token_id": null,
138
- "hidden_act": "quick_gelu",
139
- "hidden_size": 1024,
140
- "id2label": {
141
- "0": "LABEL_0",
142
- "1": "LABEL_1"
143
- },
144
- "image_size": 224,
145
- "initializer_factor": 1.0,
146
- "initializer_range": 0.02,
147
- "intermediate_size": 4096,
148
- "is_decoder": false,
149
- "is_encoder_decoder": false,
150
- "label2id": {
151
- "LABEL_0": 0,
152
- "LABEL_1": 1
153
- },
154
- "layer_norm_eps": 1e-05,
155
- "length_penalty": 1.0,
156
- "max_length": 20,
157
- "min_length": 0,
158
- "model_type": "clip_vision_model",
159
- "no_repeat_ngram_size": 0,
160
- "num_attention_heads": 16,
161
- "num_beam_groups": 1,
162
- "num_beams": 1,
163
- "num_channels": 3,
164
- "num_hidden_layers": 24,
165
- "num_return_sequences": 1,
166
- "output_attentions": false,
167
- "output_hidden_states": false,
168
- "output_scores": false,
169
- "pad_token_id": null,
170
- "patch_size": 14,
171
- "prefix": null,
172
- "problem_type": null,
173
- "projection_dim": 512,
174
- "pruned_heads": {},
175
- "remove_invalid_values": false,
176
- "repetition_penalty": 1.0,
177
- "return_dict": true,
178
- "return_dict_in_generate": false,
179
- "sep_token_id": null,
180
- "suppress_tokens": null,
181
- "task_specific_params": null,
182
- "temperature": 1.0,
183
- "tf_legacy_loss": false,
184
- "tie_encoder_decoder": false,
185
- "tie_word_embeddings": true,
186
- "tokenizer_class": null,
187
- "top_k": 50,
188
- "top_p": 1.0,
189
- "torch_dtype": null,
190
- "torchscript": false,
191
- "transformers_version": "4.30.1",
192
- "typical_p": 1.0,
193
- "use_bfloat16": false
194
- }
195
- }
mllm/flamingo/flamingo-vicuna-33B-v1.3.json DELETED
@@ -1,111 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 4,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "/home/luodian/projects/checkpoints/vicuna-33b-v1.3",
10
- "architectures": [
11
- "LlamaForCausalLM"
12
- ],
13
- "bos_token_id": 1,
14
- "eos_token_id": 2,
15
- "hidden_act": "silu",
16
- "hidden_size": 6656,
17
- "initializer_range": 0.02,
18
- "intermediate_size": 17920,
19
- "max_position_embeddings": 2048,
20
- "model_type": "llama",
21
- "num_attention_heads": 52,
22
- "num_hidden_layers": 60,
23
- "pad_token_id": 0,
24
- "rms_norm_eps": 1e-06,
25
- "tie_word_embeddings": false,
26
- "torch_dtype": "float16",
27
- "transformers_version": "4.28.1",
28
- "use_cache": false,
29
- "vocab_size": 32000
30
- },
31
- "torch_dtype": "float32",
32
- "transformers_version": null,
33
- "use_media_placement_augmentation": true,
34
- "vision_config": {
35
- "_name_or_path": "openai/clip-vit-large-patch14",
36
- "add_cross_attention": false,
37
- "architectures": null,
38
- "attention_dropout": 0.0,
39
- "bad_words_ids": null,
40
- "begin_suppress_tokens": null,
41
- "bos_token_id": null,
42
- "chunk_size_feed_forward": 0,
43
- "cross_attention_hidden_size": null,
44
- "decoder_start_token_id": null,
45
- "diversity_penalty": 0.0,
46
- "do_sample": false,
47
- "early_stopping": false,
48
- "encoder_no_repeat_ngram_size": 0,
49
- "eos_token_id": null,
50
- "exponential_decay_length_penalty": null,
51
- "finetuning_task": null,
52
- "forced_bos_token_id": null,
53
- "forced_eos_token_id": null,
54
- "hidden_act": "quick_gelu",
55
- "hidden_size": 1024,
56
- "id2label": {
57
- "0": "LABEL_0",
58
- "1": "LABEL_1"
59
- },
60
- "image_size": 224,
61
- "initializer_factor": 1.0,
62
- "initializer_range": 0.02,
63
- "intermediate_size": 4096,
64
- "is_decoder": false,
65
- "is_encoder_decoder": false,
66
- "label2id": {
67
- "LABEL_0": 0,
68
- "LABEL_1": 1
69
- },
70
- "layer_norm_eps": 1e-05,
71
- "length_penalty": 1.0,
72
- "max_length": 20,
73
- "min_length": 0,
74
- "model_type": "clip_vision_model",
75
- "no_repeat_ngram_size": 0,
76
- "num_attention_heads": 16,
77
- "num_beam_groups": 1,
78
- "num_beams": 1,
79
- "num_channels": 3,
80
- "num_hidden_layers": 24,
81
- "num_return_sequences": 1,
82
- "output_attentions": false,
83
- "output_hidden_states": false,
84
- "output_scores": false,
85
- "pad_token_id": null,
86
- "patch_size": 14,
87
- "prefix": null,
88
- "problem_type": null,
89
- "projection_dim": 512,
90
- "pruned_heads": {},
91
- "remove_invalid_values": false,
92
- "repetition_penalty": 1.0,
93
- "return_dict": true,
94
- "return_dict_in_generate": false,
95
- "sep_token_id": null,
96
- "suppress_tokens": null,
97
- "task_specific_params": null,
98
- "temperature": 1.0,
99
- "tf_legacy_loss": false,
100
- "tie_encoder_decoder": false,
101
- "tie_word_embeddings": true,
102
- "tokenizer_class": null,
103
- "top_k": 50,
104
- "top_p": 1.0,
105
- "torch_dtype": null,
106
- "torchscript": false,
107
- "transformers_version": "4.30.1",
108
- "typical_p": 1.0,
109
- "use_bfloat16": false
110
- }
111
- }
mllm/flamingo/flamingo-vicuna-7B-v1.3.json DELETED
@@ -1,111 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "FlamingoForConditionalGeneration"
5
- ],
6
- "cross_attn_every_n_layers": 4,
7
- "model_type": "flamingo",
8
- "text_config": {
9
- "_name_or_path": "/mnt/petrelfs/share_data/zhangyuanhan/vicuna-7b-v1.3",
10
- "architectures": [
11
- "LlamaForCausalLM"
12
- ],
13
- "bos_token_id": 1,
14
- "eos_token_id": 2,
15
- "hidden_act": "silu",
16
- "hidden_size": 4096,
17
- "initializer_range": 0.02,
18
- "intermediate_size": 11008,
19
- "max_position_embeddings": 2048,
20
- "model_type": "llama",
21
- "num_attention_heads": 32,
22
- "num_hidden_layers": 32,
23
- "pad_token_id": 0,
24
- "rms_norm_eps": 1e-06,
25
- "tie_word_embeddings": false,
26
- "torch_dtype": "float16",
27
- "transformers_version": "4.28.1",
28
- "use_cache": false,
29
- "vocab_size": 32000
30
- },
31
- "torch_dtype": "float32",
32
- "transformers_version": null,
33
- "use_media_placement_augmentation": true,
34
- "vision_config": {
35
- "_name_or_path": "openai/clip-vit-large-patch14",
36
- "add_cross_attention": false,
37
- "architectures": null,
38
- "attention_dropout": 0.0,
39
- "bad_words_ids": null,
40
- "begin_suppress_tokens": null,
41
- "bos_token_id": null,
42
- "chunk_size_feed_forward": 0,
43
- "cross_attention_hidden_size": null,
44
- "decoder_start_token_id": null,
45
- "diversity_penalty": 0.0,
46
- "do_sample": false,
47
- "early_stopping": false,
48
- "encoder_no_repeat_ngram_size": 0,
49
- "eos_token_id": null,
50
- "exponential_decay_length_penalty": null,
51
- "finetuning_task": null,
52
- "forced_bos_token_id": null,
53
- "forced_eos_token_id": null,
54
- "hidden_act": "quick_gelu",
55
- "hidden_size": 1024,
56
- "id2label": {
57
- "0": "LABEL_0",
58
- "1": "LABEL_1"
59
- },
60
- "image_size": 224,
61
- "initializer_factor": 1.0,
62
- "initializer_range": 0.02,
63
- "intermediate_size": 4096,
64
- "is_decoder": false,
65
- "is_encoder_decoder": false,
66
- "label2id": {
67
- "LABEL_0": 0,
68
- "LABEL_1": 1
69
- },
70
- "layer_norm_eps": 1e-05,
71
- "length_penalty": 1.0,
72
- "max_length": 20,
73
- "min_length": 0,
74
- "model_type": "clip_vision_model",
75
- "no_repeat_ngram_size": 0,
76
- "num_attention_heads": 16,
77
- "num_beam_groups": 1,
78
- "num_beams": 1,
79
- "num_channels": 3,
80
- "num_hidden_layers": 24,
81
- "num_return_sequences": 1,
82
- "output_attentions": false,
83
- "output_hidden_states": false,
84
- "output_scores": false,
85
- "pad_token_id": null,
86
- "patch_size": 14,
87
- "prefix": null,
88
- "problem_type": null,
89
- "projection_dim": 512,
90
- "pruned_heads": {},
91
- "remove_invalid_values": false,
92
- "repetition_penalty": 1.0,
93
- "return_dict": true,
94
- "return_dict_in_generate": false,
95
- "sep_token_id": null,
96
- "suppress_tokens": null,
97
- "task_specific_params": null,
98
- "temperature": 1.0,
99
- "tf_legacy_loss": false,
100
- "tie_encoder_decoder": false,
101
- "tie_word_embeddings": true,
102
- "tokenizer_class": null,
103
- "top_k": 50,
104
- "top_p": 1.0,
105
- "torch_dtype": null,
106
- "torchscript": false,
107
- "transformers_version": "4.30.1",
108
- "typical_p": 1.0,
109
- "use_bfloat16": false
110
- }
111
- }
mllm/flamingo/injecting_falcon_into_flamingo.py DELETED
@@ -1,49 +0,0 @@
1
- import os
2
- import torch
3
- from .configuration_flamingo import FlamingoConfig
4
- from .modeling_flamingo import FlamingoForConditionalGeneration
5
-
6
- root_dir = os.environ["AZP"]
7
- print(root_dir)
8
-
9
-
10
- config = FlamingoConfig.from_json_file("./flamingo-falcon-7B.json")
11
- model = FlamingoForConditionalGeneration(config=config)
12
-
13
-
14
- state_dict_files = [
15
- f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00001-of-00002.bin",
16
- f"{root_dir}/otter/checkpoints/falcon-7b/pytorch_model-00002-of-00002.bin",
17
- ]
18
-
19
- state_dict = {}
20
- for file in state_dict_files:
21
- state_dict_part = torch.load(file, map_location="cpu")
22
- state_dict.update(state_dict_part)
23
-
24
-
25
- state_dict_3 = torch.load(f"{root_dir}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
26
- for cur_key in list(state_dict_3.keys()):
27
- if "vision_encoder" not in cur_key:
28
- del state_dict_3[cur_key]
29
-
30
- _ = model.load_state_dict(
31
- state_dict_3,
32
- False,
33
- )
34
- print(_[1])
35
-
36
- save_state_dict_1 = {}
37
- for key in state_dict:
38
- if ".h." in key:
39
- _, _, layer_num, *remain_names = key.split(".")
40
- target_key = f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
41
- else:
42
- target_key = key
43
- save_state_dict_1[f"{target_key}"] = state_dict[key]
44
- _ = model.lang_encoder.load_state_dict(
45
- save_state_dict_1,
46
- False,
47
- )
48
- print(_[1])
49
- model.save_pretrained(f"{root_dir}/otter/checkpoints/flamingo-falcon-7b/")
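The remapping loop above inserts a `decoder_layer` level into each per-layer Falcon key so the weights line up with Flamingo's wrapped decoder blocks. A standalone sketch of the same transformation:

```python
def remap_falcon_key(key: str) -> str:
    # Per-layer keys look like "transformer.h.<n>.<rest>"; everything after
    # the layer index moves under an extra "decoder_layer" level.
    if ".h." in key:
        _, _, layer_num, *remain_names = key.split(".")
        return f"transformer.h.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
    return key

print(remap_falcon_key("transformer.h.5.self_attention.query_key_value.weight"))
# -> transformer.h.5.decoder_layer.self_attention.query_key_value.weight
print(remap_falcon_key("transformer.word_embeddings.weight"))  # non-layer keys pass through
```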
mllm/flamingo/injecting_llama2_into_flamingo.py DELETED
@@ -1,95 +0,0 @@
1
- import argparse
2
- import os
3
-
4
- import torch
5
- from tqdm import tqdm
6
-
7
- import sys
8
-
9
- from .configuration_flamingo import FlamingoConfig
10
- from .modeling_flamingo import FlamingoForConditionalGeneration
11
-
12
- # from .configuration_flamingo import FlamingoConfig
13
- # from .modeling_flamingo import FlamingoForConditionalGeneration
14
-
15
- parser = argparse.ArgumentParser(description="Convert Vicuna model")
16
- parser.add_argument("--model_choice", type=str, default="13B", help="Choose either '7B' or '13B'")
17
- parser.add_argument("--llama2_root_dir", type=str, default="/home/luodian/projects/checkpoints")
18
- parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
19
- args = parser.parse_args()
20
-
21
- # os.environ["TOKENIZERS_PARALLELISM"] = "false"
22
-
23
- root_dir = args.llama2_root_dir
24
- model_choice = args.model_choice
25
- save_root_dir = args.save_root_dir
26
-
27
- # prepare the Llama-2 chat model first
28
- # you can visit https://huggingface.co/meta-llama/Llama-2-7b-chat-hf (or the 13B variant) to download the chat checkpoints.
29
- if model_choice == "7B":
30
- config_file = "./flamingo/flamingo-llama2-chat-7B.json"
31
- state_dict_files = [
32
- f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00001-of-00002.bin",
33
- f"{root_dir}/Llama-2-7b-chat-hf/pytorch_model-00002-of-00002.bin",
34
- ]
35
- save_path = f"{save_root_dir}/flamingo-llama2-chat-7B-init"
36
- elif model_choice == "13B":
37
- config_file = "./flamingo/flamingo-llama2-chat-13B.json"
38
- state_dict_files = [
39
- f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00001-of-00003.bin",
40
- f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00002-of-00003.bin",
41
- f"{root_dir}/Llama-2-13b-chat-hf/pytorch_model-00003-of-00003.bin",
42
- ]
43
- save_path = f"{save_root_dir}/flamingo-llama2-chat-13B-init"
44
- else:
45
- raise ValueError("Invalid model_choice. Choose either '13B' or '7B'.")
46
-
47
- config = FlamingoConfig.from_json_file(config_file)
48
- model = FlamingoForConditionalGeneration(config=config)
49
-
50
- # load flamingo's vision encoder from last checkpoint.
51
- # you can visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
52
- # AZP = "os.environ["AZP"]"
53
- AZP = os.environ["AZP"]
54
- state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
55
- for cur_key in list(state_dict_3.keys()):
56
- if "vision_encoder" not in cur_key:
57
- del state_dict_3[cur_key]
58
-
59
- load_msg = model.load_state_dict(
60
- state_dict_3,
61
- False,
62
- )
63
- # print incompatible keys
64
- print(load_msg[1])
65
-
66
- # Load the Llama-2 chat weights
67
- state_dict = {}
68
- for file in tqdm(state_dict_files, desc="Loading state dict"):
69
- state_dict_part = torch.load(file, map_location="cpu")
70
- state_dict.update(state_dict_part)
71
-
72
- save_state_dict_1 = {}
73
- for key in state_dict:
74
- if ".layers." in key:
75
- _, _, layer_num, *remain_names = key.split(".")
76
- target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
77
- else:
78
- target_key = key
79
- save_state_dict_1[f"{target_key}"] = state_dict[key]
80
-
81
- # Resize the token embeddings to 32000 so the base Llama-2 weights load cleanly
82
- model.lang_encoder.resize_token_embeddings(32000)
83
-
84
- load_msg = model.lang_encoder.load_state_dict(
85
- save_state_dict_1,
86
- False,
87
- )
88
- # Resize the token embeddings to 32002 to make room for the added special tokens
89
- model.lang_encoder.resize_token_embeddings(32002)
90
- # print incompatible keys
91
- print(load_msg[1])
92
-
93
-
94
- print(f"Saving model to {save_path}...")
95
- model.save_pretrained(save_path, max_shard_size="10GB")
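A note on the resize-load-resize sequence above: the embedding table is first shrunk to the base Llama-2 vocabulary (32000) so the pretrained rows load cleanly, then grown to 32002 for the extra tokens added on top. A toy sketch of what that preserves, with a plain `nn.Embedding` standing in for the model's token embeddings:

```python
import torch
import torch.nn as nn

base_vocab, extended_vocab, dim = 32000, 32002, 8  # dim is a toy hidden size

pretrained = nn.Embedding(base_vocab, dim)   # stands in for the Llama-2 table
emb = nn.Embedding(extended_vocab, dim)      # table after the final resize

# Copy the base rows; the last two rows keep their fresh initialization,
# mirroring what resize_token_embeddings preserves across resizes.
with torch.no_grad():
    emb.weight[:base_vocab].copy_(pretrained.weight)

assert torch.equal(emb.weight[:base_vocab], pretrained.weight)
```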
mllm/flamingo/injecting_mpt-1B-redpajama_into_flamingo.py DELETED
@@ -1,97 +0,0 @@
1
- import argparse
2
- import os
3
-
4
- import torch
5
- from tqdm import tqdm
6
-
7
- import sys
8
-
9
- from configuration_flamingo import FlamingoConfig
10
- from modeling_flamingo import FlamingoForConditionalGeneration
11
- from utils import rename_flamingo_checkpoint
12
-
13
-
14
- parser = argparse.ArgumentParser(description="Convert MPT model")
15
- parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints")
16
- parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
17
- parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected")
18
- args = parser.parse_args()
19
-
20
-
21
- root_dir = args.mpt_root_dir
22
- save_root_dir = args.save_root_dir
23
-
24
- # prepare the MPT model first
25
- # you can visit https://huggingface.co/mosaicml to download the MPT-1B RedPajama checkpoint.
26
- config_file = "./flamingo/flamingo-mpt-1B-redpajama.json"
27
- state_dict_file = f"{root_dir}/pytorch_model.bin"
28
- save_path = f"{save_root_dir}/flamingo-mpt-1b-redpajama-200b-dolly"
29
-
30
- config = FlamingoConfig.from_json_file(config_file)
31
-
32
- model = FlamingoForConditionalGeneration(config=config)
33
-
34
- # Loading mpt weights
35
- state_dict = torch.load(state_dict_file, map_location="cpu")
36
- save_state_dict_1 = {}
37
- for key in state_dict:
-     if ".blocks." in key:
-         _, _, layer_num, *remain_names = key.split(".")
-         target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-     else:
-         target_key = key
-     save_state_dict_1[target_key] = state_dict[key]
-
- load_msg = model.lang_encoder.load_state_dict(
-     save_state_dict_1,
-     False,
- )
-
- # load flamingo's vision encoder from the last checkpoint shard.
- # visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
- AZP = os.environ["AZP"]
- state_dict_3 = torch.load(f"{AZP}/pytorch_model-00004-of-00004.bin", map_location="cpu")
- for cur_key in list(state_dict_3.keys()):
-     if "vision_encoder" not in cur_key:
-         del state_dict_3[cur_key]
-
- load_msg = model.load_state_dict(
-     state_dict_3,
-     False,
- )
- # print incompatible keys
- print(load_msg[1])
-
- # reload the remapped language-model weights after the vision encoder overwrite
- save_state_dict_1 = {}
- for key in state_dict:
-     if ".blocks." in key:
-         _, _, layer_num, *remain_names = key.split(".")
-         target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-     else:
-         target_key = key
-     save_state_dict_1[target_key] = state_dict[key]
-
- load_msg = model.lang_encoder.load_state_dict(
-     save_state_dict_1,
-     False,
- )
- # print incompatible keys
- print(load_msg[1])
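- # Optionally overlay pretrained OpenFlamingo weights (perceiver and gated cross-attention).
- # rename_flamingo_checkpoint (from utils) maps the original trainer key names onto this
- # HF module layout; the embeddings are shrunk to the checkpoint's vocab to load, then grown back.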
- if args.flamingo_dir is not None:
-     state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu")
-     save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2)
-     real_vocab_size = config.text_config.vocab_size
-     # Resize the token embeddings to match the checkpoint's vocab (50280) so the weights load cleanly
-     model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0])
-
-     load_msg = model.load_state_dict(
-         save_state_dict_2,
-         False,
-     )
-     # print incompatible keys
-     print(load_msg[1])
-     # Resize the token embeddings back to the configured vocab size (50432)
-     model.lang_encoder.resize_token_embeddings(real_vocab_size)
-
- print(f"Saving model to {save_path}...")
- model.save_pretrained(save_path, max_shard_size="10GB")
mllm/flamingo/injecting_mpt_into_flamingo.py DELETED
@@ -1,109 +0,0 @@
- import argparse
- import os
-
- import torch
- from tqdm import tqdm
-
- import sys
-
- from configuration_flamingo import FlamingoConfig
- from modeling_flamingo import FlamingoForConditionalGeneration
- from utils import rename_flamingo_checkpoint
-
- parser = argparse.ArgumentParser(description="Convert MPT model")
- parser.add_argument("--model_choice", type=str, choices=["7B", "30B"], required=True, help="Choose either '7B' or '30B'")
- parser.add_argument("--mpt_root_dir", type=str, default="/home/luodian/projects/checkpoints")
- parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
- parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected")
- args = parser.parse_args()
-
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- root_dir = args.mpt_root_dir
- model_choice = args.model_choice
- save_root_dir = args.save_root_dir
-
- # prepare the MPT model first
- # visit https://huggingface.co/mosaicml to download the 7B and 30B instruct checkpoints.
- if model_choice == "30B":
-     config_file = "./flamingo/flamingo-mpt-30B.json"
-     state_dict_files = [
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00001-of-00007.bin",
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00002-of-00007.bin",
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00003-of-00007.bin",
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00004-of-00007.bin",
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00005-of-00007.bin",
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00006-of-00007.bin",
-         f"{root_dir}/mpt-30b-instruct/pytorch_model-00007-of-00007.bin",
-     ]
-     save_path = f"{save_root_dir}/flamingo-mpt-30B-instruct-init"
- elif model_choice == "7B":
-     config_file = "./flamingo/flamingo-mpt-7B.json"
-     state_dict_files = [
-         f"{root_dir}/mpt-7b/pytorch_model-00001-of-00002.bin",
-         f"{root_dir}/mpt-7b/pytorch_model-00002-of-00002.bin",
-     ]
-     save_path = f"{save_root_dir}/flamingo-mpt-7B"
- else:
-     raise ValueError("Invalid model_choice. Choose either '30B' or '7B'.")
-
- config = FlamingoConfig.from_json_file(config_file)
-
- model = FlamingoForConditionalGeneration(config=config)
-
-
- # load flamingo's vision encoder from the last checkpoint shard.
- # visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
- AZP = os.environ["AZP"]
- state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
- for cur_key in list(state_dict_3.keys()):
-     if "vision_encoder" not in cur_key:
-         del state_dict_3[cur_key]
-
- load_msg = model.load_state_dict(
-     state_dict_3,
-     False,
- )
- # print incompatible keys
- print(load_msg[1])
-
- # Loading MPT weights from the sharded checkpoint files
- state_dict = {}
- for file in tqdm(state_dict_files, desc="Loading state dict"):
-     state_dict_part = torch.load(file, map_location="cpu")
-     state_dict.update(state_dict_part)
-
- save_state_dict_1 = {}
- for key in state_dict:
-     if ".blocks." in key:
-         _, _, layer_num, *remain_names = key.split(".")
-         target_key = f"transformer.blocks.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-     else:
-         target_key = key
-     save_state_dict_1[target_key] = state_dict[key]
-
- load_msg = model.lang_encoder.load_state_dict(
-     save_state_dict_1,
-     False,
- )
- # print incompatible keys
- print(load_msg[1])
- if args.flamingo_dir is not None:
-     state_dict_2 = torch.load(f"{args.flamingo_dir}/checkpoint.pt", map_location="cpu")
-     save_state_dict_2 = rename_flamingo_checkpoint(state_dict_2)
-
95
- real_vocab_size = config.text_config.vocab_size
96
- # Reshape the token embedding to 50280 for compatible
97
- model.lang_encoder.resize_token_embeddings(save_state_dict_2["lang_encoder.transformer.wte.weight"].shape[0])
98
-
99
- load_msg = model.load_state_dict(
100
- save_state_dict_2,
101
- False,
102
- )
103
- # print incompatible keys
104
- print(load_msg[1])
105
- # Reshape the token embedding to 50432
106
- model.lang_encoder.resize_token_embeddings(real_vocab_size)
107
-
108
- print(f"Saving model to {save_path}...")
109
- model.save_pretrained(save_path, max_shard_size="10GB")
mllm/flamingo/injecting_vicuna_into_flamingo.py DELETED
@@ -1,100 +0,0 @@
- import argparse
- import os
-
- import torch
- from tqdm import tqdm
-
- import sys
-
- from .configuration_flamingo import FlamingoConfig
- from .modeling_flamingo import FlamingoForConditionalGeneration
-
- parser = argparse.ArgumentParser(description="Convert Vicuna model")
- parser.add_argument("--model_choice", type=str, choices=["7B", "33B"], required=True, help="Choose either '7B' or '33B'")
- parser.add_argument("--vicuna_root_dir", type=str, default="/home/luodian/projects/checkpoints")
- parser.add_argument("--save_root_dir", type=str, default="/home/luodian/projects/checkpoints")
- parser.add_argument("--flamingo_dir", type=str, default=None, help="If the pretrained flamingo weights also need to be injected")
- args = parser.parse_args()
-
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- root_dir = args.vicuna_root_dir
- model_choice = args.model_choice
- save_root_dir = args.save_root_dir
-
- # prepare the Vicuna model first
- # visit https://huggingface.co/lmsys to download the vicuna-7b-v1.3 and vicuna-33b-v1.3 checkpoints.
- if model_choice == "33B":
-     config_file = "./flamingo/flamingo-vicuna-33B-v1.3.json"
-     state_dict_files = [
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00001-of-00007.bin",
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00002-of-00007.bin",
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00003-of-00007.bin",
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00004-of-00007.bin",
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00005-of-00007.bin",
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00006-of-00007.bin",
-         f"{root_dir}/vicuna-33b-v1.3/pytorch_model-00007-of-00007.bin",
-     ]
-     save_path = f"{save_root_dir}/flamingo-vicuna-33B-v1.3-init"
- elif model_choice == "7B":
-     config_file = "./flamingo/flamingo-vicuna-7B-v1.3.json"
-     state_dict_files = [
-         f"{root_dir}/vicuna-7b-v1.3/pytorch_model-00001-of-00002.bin",
-         f"{root_dir}/vicuna-7b-v1.3/pytorch_model-00002-of-00002.bin",
-     ]
-     save_path = f"{save_root_dir}/flamingo-vicuna-7B-v1.3-init"
- else:
-     raise ValueError("Invalid model_choice. Choose either '33B' or '7B'.")
-
- config = FlamingoConfig.from_json_file(config_file)
- model = FlamingoForConditionalGeneration(config=config)
-
- # load flamingo's vision encoder from the last checkpoint shard.
- # visit https://huggingface.co/luodian/openflamingo-9b-hf/tree/main to download the checkpoint.
- AZP = os.environ["AZP"]
- state_dict_3 = torch.load(f"{AZP}/otter/checkpoints/flamingo_9b_hf/pytorch_model-00004-of-00004.bin", map_location="cpu")
- for cur_key in list(state_dict_3.keys()):
-     if "vision_encoder" not in cur_key:
-         del state_dict_3[cur_key]
-
- load_msg = model.load_state_dict(
-     state_dict_3,
-     False,
- )
- # print incompatible keys
- print(load_msg[1])
-
- # Loading Vicuna weights from the sharded checkpoint files
- state_dict = {}
- for file in tqdm(state_dict_files, desc="Loading state dict"):
-     state_dict_part = torch.load(file, map_location="cpu")
-     state_dict.update(state_dict_part)
-
- save_state_dict_1 = {}
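- # LLaMA-style keys move under the FlamingoLayer wrapper, e.g. (illustrative):
- #   "model.layers.0.self_attn.q_proj.weight" -> "model.layers.0.decoder_layer.self_attn.q_proj.weight"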
- for key in state_dict:
-     if ".layers." in key:
-         _, _, layer_num, *remain_names = key.split(".")
-         target_key = f"model.layers.{layer_num}.decoder_layer.{'.'.join(remain_names)}"
-     else:
-         target_key = key
-     save_state_dict_1[target_key] = state_dict[key]
-
- # Resize the token embeddings to 32000 so the pretrained Vicuna weights load cleanly
- model.lang_encoder.resize_token_embeddings(32000)
-
- load_msg = model.lang_encoder.load_state_dict(
-     save_state_dict_1,
-     False,
- )
- # Resize the token embeddings back to 32002 to account for the added special tokens
- model.lang_encoder.resize_token_embeddings(32002)
- # print incompatible keys
- print(load_msg[1])
-
-
- print(f"Saving model to {save_path}...")
- model.save_pretrained(save_path, max_shard_size="10GB")
mllm/flamingo/modeling_flamingo.py DELETED
@@ -1,966 +0,0 @@
- import random
- from dataclasses import dataclass
- from typing import Callable, Optional
-
- import torch
- import torch.nn as nn
- from accelerate.hooks import AlignDevicesHook, add_hook_to_module
- from einops import rearrange, repeat
- from transformers import CLIPVisionModel, LlamaForCausalLM, LlamaTokenizer
- from transformers.modeling_outputs import CausalLMOutputWithPast
- from transformers.modeling_utils import PreTrainedModel
- from transformers.models.auto import AutoModel, AutoModelForCausalLM, AutoTokenizer
-
- from .configuration_flamingo import FlamingoConfig
- from .falcon.modelling_RW import RWForCausalLM
- from .mpt.modeling_mpt import MPTForCausalLM
- from .mpt_redpajama.mosaic_gpt import MosaicGPT
-
-
- __KNOWN_DECODER_LAYERS_ATTR_NAMES = {
-     "opt": "model.decoder.layers",
-     "gptneo": "transformer.h",
-     "gptj": "transformer.h",
-     "gpt-j": "transformer.h",
-     "pythia": "gpt_neox.layers",
-     "llama": "model.layers",
-     "RWForCausalLM": "transformer.h",
-     "MPTForCausalLM": "transformer.blocks",
-     "MosaicGPT": "transformer.blocks",
- }
-
-
- def _infer_decoder_layers_attr_name(model: nn.Module):
-     for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
-         if k.lower() in model.__class__.__name__.lower():
-             return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
-
-     raise ValueError(
-         "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
-     )
-
-
- def extend_instance(obj, mixin):
-     """Apply mixins to a class instance after creation."""
-     base_cls = obj.__class__
-     base_cls_name = obj.__class__.__name__
-     obj.__class__ = type(base_cls_name, (mixin, base_cls), {})  # mixin needs to go first for our forward() logic to work
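-     # e.g. extend_instance(lang_encoder, FlamingoLMMixin) rebinds the instance's class to
-     # type("LlamaForCausalLM", (FlamingoLMMixin, LlamaForCausalLM), {}), so the mixin's
-     # forward() runs first via the MRO (illustrative)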
-
-
- def getattr_recursive(obj, att):
-     """
-     Return nested attribute of obj
-     Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
-     """
-     if att == "":
-         return obj
-     i = att.find(".")
-     if i < 0:
-         return getattr(obj, att)
-     else:
-         return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
-
-
- def setattr_recursive(obj, att, val):
-     """
-     Set nested attribute of obj
-     Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
-     """
-     if "." in att:
-         obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
-     setattr(obj, att.split(".")[-1], val)
-
-
- def exists(val):
-     return val is not None
-
-
- class FlamingoPerceiverBlock(nn.Module):
-     def __init__(self, *, dim: int, dim_head: int = 64, heads: int = 8, mult: int = 4):
-         super().__init__()
-         self.scale = dim_head**-0.5
-         self.heads = heads
-         inner_dim = dim_head * heads
-         ff_dim = dim * mult
-         self.norm_media = nn.LayerNorm(dim)
-         self.norm_latents = nn.LayerNorm(dim)
-
-         self.to_q = nn.Linear(dim, inner_dim, bias=False)
-         self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
-         self.to_out = nn.Linear(inner_dim, dim, bias=False)
-         self.feed_forward = nn.ModuleList(
-             [
-                 nn.LayerNorm(dim),
-                 nn.Linear(dim, ff_dim, bias=False),
-                 nn.GELU(),
-                 nn.Linear(ff_dim, dim, bias=False),
-             ]
-         )
-
-     def forward(self, x: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
-         """
-         Args:
-             x (torch.Tensor): image features
-                 shape (b, T, n1, D)
-             latents (torch.Tensor): latent features
-                 shape (b, T, n2, D)
-         """
-         x = self.norm_media(x)
-         residual_latents = latents
-         latents = self.norm_latents(latents)
-
-         h = self.heads
-
-         q = self.to_q(latents)
-         kv_input = torch.cat((x, latents), dim=-2)
-         k, v = self.to_kv(kv_input).chunk(2, dim=-1)
-         q = rearrange(q, "b t n (h d) -> b h t n d", h=h)
-         k = rearrange(k, "b t n (h d) -> b h t n d", h=h)
-         v = rearrange(v, "b t n (h d) -> b h t n d", h=h)
-         q = q * self.scale
-
-         # attention
-         sim = torch.einsum("... i d, ... j d -> ... i j", q, k)
-         sim = sim - sim.amax(dim=-1, keepdim=True).detach()
-         attn = sim.softmax(dim=-1)
-
-         out = torch.einsum("... i j, ... j d -> ... i d", attn, v)
-         out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
-         out = self.to_out(out) + residual_latents
-         residual_out = out
-         for layer in self.feed_forward:
-             out = layer(out)
-         return out + residual_out
-
-
- class FlamingoPerceiverResampler(nn.Module):
-     def __init__(
-         self,
-         *,
-         dim: int,
-         depth: int = 6,
-         dim_head: int = 64,
-         heads: int = 8,
-         num_latents: int = 64,
-         max_num_media: Optional[int] = None,
-         max_num_frames: Optional[int] = None,
-         ff_mult: int = 4,
-     ):
-         super().__init__()
-         self.latents = nn.Parameter(torch.randn(num_latents, dim))
-         self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None
-
-         self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None
-
-         self.layers = nn.ModuleList([])
-         for _ in range(depth):
-             self.layers.append(FlamingoPerceiverBlock(dim=dim, dim_head=dim_head, heads=heads, mult=ff_mult))
-
-         self.norm = nn.LayerNorm(dim)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """
-         Args:
-             x (torch.Tensor): image features
-                 shape (b, T, F, v, D)
-         Returns:
-             shape (b, T, n, D) where n is self.num_latents
-         """
-         b, T, F, v = x.shape[:4]
-
-         # frame and media time embeddings
-         if exists(self.frame_embs):
-             frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
-             x = x + frame_embs
-         x = rearrange(x, "b T F v d -> b T (F v) d")  # flatten the frame and spatial dimensions
-         if exists(self.media_time_embs):
-             x = x + self.media_time_embs[:T]
-
-         # blocks
-         latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
-         for block in self.layers:
-             latents = block(x, latents)
-         return self.norm(latents)
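- # Shape check (illustrative): 2 samples, 3 media, 1 frame, 256 patch tokens, dim 1024:
- #   resampler = FlamingoPerceiverResampler(dim=1024)
- #   resampler(torch.randn(2, 3, 1, 256, 1024)).shape  # -> (2, 3, 64, 1024): 64 learned latents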
-
-
- class FlamingoMaskedCrossAttention(nn.Module):
-     def __init__(
-         self,
-         *,
-         dim: int,
-         dim_visual: int,
-         dim_head: int = 64,
-         heads: int = 8,
-         only_attend_immediate_media: bool = True,
-     ):
-         super().__init__()
-         self.scale = dim_head**-0.5
-         self.heads = heads
-         inner_dim = dim_head * heads
-
-         self.norm = nn.LayerNorm(dim)
-
-         self.to_q = nn.Linear(dim, inner_dim, bias=False)
-         self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
-         self.to_out = nn.Linear(inner_dim, dim, bias=False)
-
-         # whether text attends only to the immediately preceding image, or to all previous images
-         self.only_attend_immediate_media = only_attend_immediate_media
-
-     def forward(
-         self,
-         x: torch.Tensor,
-         media: torch.Tensor,
-         media_locations: Optional[torch.BoolTensor] = None,
-         attend_previous: bool = True,
-     ) -> torch.Tensor:
-         """
-         Args:
-             x (torch.Tensor): text features
-                 shape (B, T_txt, D_txt)
-             media (torch.Tensor): image features
-                 shape (B, T_img, n, D_img) where n is the number of latents
-             media_locations: boolean mask identifying the media tokens in x
-                 shape (B, T_txt)
-             attend_previous: bool
-                 If False, ignore the immediately preceding image and only attend from the following image onward
-         """
-         _, T_img, n = media.shape[:3]
-         h = self.heads
-
-         x = self.norm(x)
-
-         q = self.to_q(x)
-         media = rearrange(media, "b t n d -> b (t n) d")
-
-         k, v = self.to_kv(media).chunk(2, dim=-1)
-         q = rearrange(q, "b n (h d) -> b h n d", h=h)
-         k = rearrange(k, "b n (h d) -> b h n d", h=h)
-         v = rearrange(v, "b n (h d) -> b h n d", h=h)
-
-         q = q * self.scale
-
-         sim = torch.einsum("... i d, ... j d -> ... i j", q, k)
-
-         if exists(media_locations):
-             # at each True, increment the time counter (relative to media time)
-             text_time = media_locations.cumsum(dim=-1)
-             media_time = torch.arange(T_img, device=x.device) + 1
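-             # e.g. media_locations = [True, False, False, True, False] gives
-             # text_time = [1, 1, 1, 2, 2]: the first three tokens belong to image 1,
-             # the last two to image 2 (illustrative)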
-
-             if not attend_previous:
-                 text_time[~media_locations] += 1
-                 # make sure the max is still the number of images in the sequence
-                 text_time[
-                     text_time
-                     > repeat(
-                         torch.count_nonzero(media_locations, dim=1),
-                         "b -> b i",
-                         i=text_time.shape[1],
-                     )
-                 ] = 0
-
-             # text time must equal media time if only attending to the most immediate image;
-             # otherwise, text time must be greater than or equal to media time (attending to all previous images)
-             mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
-
-             text_to_media_mask = mask_op(
-                 rearrange(text_time, "b i -> b 1 i 1"),
-                 repeat(media_time, "j -> 1 1 1 (j n)", n=n),
-             )
-             sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
-
-         sim = sim - sim.amax(dim=-1, keepdim=True).detach()
-         attn = sim.softmax(dim=-1)
-
-         if exists(media_locations) and self.only_attend_immediate_media:
-             # any text without a preceding media needs to have its attention zeroed out
-             text_without_media_mask = text_time == 0
-             text_without_media_mask = rearrange(text_without_media_mask, "b i -> b 1 i 1")
-             attn = attn.masked_fill(text_without_media_mask, 0.0)
-
-         out = torch.einsum("... i j, ... j d -> ... i d", attn, v)
-         out = rearrange(out, "b h n d -> b n (h d)")
-         return self.to_out(out)
-
-
- class FlamingoGatedCrossAttentionBlock(nn.Module):
-     def __init__(
-         self,
-         *,
-         dim: int,
-         dim_visual: int,
-         dim_head: int = 64,
-         heads: int = 8,
-         ff_mult: int = 4,
-         only_attend_immediate_media: bool = True,
-     ):
-         super().__init__()
-         self.attn = FlamingoMaskedCrossAttention(
-             dim=dim,
-             dim_visual=dim_visual,
-             dim_head=dim_head,
-             heads=heads,
-             only_attend_immediate_media=only_attend_immediate_media,
-         )
-         self.attn_gate = nn.Parameter(torch.tensor([0.0]))
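-         # the gates start at 0, so tanh(gate) = 0 and the block is an identity map at
-         # initialization, preserving the pretrained LM before cross-attention is trained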
-         self.feed_forward = nn.ModuleList(
-             [
-                 nn.LayerNorm(dim),
-                 nn.Linear(dim, dim * ff_mult, bias=False),
-                 nn.GELU(),
-                 nn.Linear(dim * ff_mult, dim, bias=False),
-             ]
-         )
-         self.ff_gate = nn.Parameter(torch.tensor([0.0]))
-
-     def forward(
-         self,
-         x: torch.Tensor,
-         media: torch.Tensor,
-         media_locations: Optional[torch.BoolTensor] = None,
-         attend_previous: bool = True,
-     ) -> torch.Tensor:
-         x = (
-             self.attn(
-                 x,
-                 media,
-                 media_locations=media_locations,
-                 attend_previous=attend_previous,
-             )
-             * self.attn_gate.tanh()
-             + x
-         )
-         residual_x = x
-         for ff in self.feed_forward:
-             x = ff(x)
-         x = x * self.ff_gate.tanh() + residual_x
-
-         return x
-
-
- class FlamingoLayer(nn.Module):
-     def __init__(self, gated_cross_attn_layer: nn.Module, decoder_layer: nn.Module):
-         super().__init__()
-         self.gated_cross_attn_layer = gated_cross_attn_layer
-         self.decoder_layer = decoder_layer
-         self.vis_x = None
-         self.media_locations = None
-
-     def is_conditioned(self) -> bool:
-         """Check whether the layer is conditioned."""
-         return self.vis_x is not None
-
-     # Borrowed from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
-     def condition_vis_x(self, vis_x) -> None:
-         self.vis_x = vis_x
-
-     def condition_media_locations(self, media_locations) -> None:
-         self.media_locations = media_locations
-
-     def condition_attend_previous(self, attend_previous) -> None:
-         self.attend_previous = attend_previous
-
-     def forward(
-         self,
-         lang_x: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         **decoder_layer_kwargs,
-     ):
-         if self.gated_cross_attn_layer is None:
-             return self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs)
-
-         if self.vis_x is None:
-             raise ValueError("vis_x must be conditioned before forward pass")
-
-         if self.media_locations is None:
-             raise ValueError("media_locations must be conditioned before forward pass")
-
-         lang_x = self.gated_cross_attn_layer(
-             lang_x,
-             self.vis_x,
-             media_locations=self.media_locations,
-             attend_previous=self.attend_previous,
-         )
-         lang_x = self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs)
-         return lang_x
-
-
- class FlamingoLMMixin(nn.Module):
-     """
-     Mixin to add cross-attention layers to a language model.
-     """
-
-     def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
-         self.decoder_layers_attr_name = decoder_layers_attr_name
-
-     def _get_decoder_layers(self):
-         return getattr_recursive(self, self.decoder_layers_attr_name)
-
-     def _set_decoder_layers(self, value):
-         setattr_recursive(self, self.decoder_layers_attr_name, value)
-
-     def init_flamingo(
-         self,
-         media_token_id: int,
-         vis_hidden_size: int,
-         cross_attn_every_n_layers: int,
-         use_media_placement_augmentation: bool,
-     ):
-         """
-         Initialize Flamingo by adding gated cross-attention blocks to the decoder. Store the media token id for computing the media locations.
-         """
-
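-         # with cross_attn_every_n_layers=4, decoder layers 3, 7, 11, ... (0-indexed)
-         # receive a gated cross-attention block; all other layers get None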
-         gated_cross_attn_layers = nn.ModuleList(
-             [
-                 FlamingoGatedCrossAttentionBlock(
-                     dim=self.config.hidden_size,
-                     dim_visual=vis_hidden_size,
-                 )
-                 if (layer_idx + 1) % cross_attn_every_n_layers == 0
-                 else None
-                 for layer_idx, _ in enumerate(self._get_decoder_layers())
-             ]
-         )
-         self._set_decoder_layers(
-             nn.ModuleList(
-                 [
-                     FlamingoLayer(gated_cross_attn_layer, decoder_layer)
-                     for gated_cross_attn_layer, decoder_layer in zip(gated_cross_attn_layers, self._get_decoder_layers())
-                 ]
-             )
-         )
-         self.media_token_id = media_token_id
-         self.use_media_placement_augmentation = use_media_placement_augmentation
-         self.initialized_flamingo = True
-
-     def forward(self, *input, **kwargs):
-         """Condition the Flamingo layers on the media locations before forward()"""
-         if not self.initialized_flamingo:
-             raise ValueError("Flamingo layers are not initialized. Please call `init_flamingo` first.")
-
-         input_ids = kwargs["input_ids"] if "input_ids" in kwargs else input[0]
-         media_locations = input_ids == self.media_token_id
-         # IMPORTANT: force `attend_previous` to True when the training data is arranged as <image>caption<|endofchunk|>
-         attend_previous = (random.random() < 0.5) if self.use_media_placement_augmentation else True
-
-         if self.__class__.__name__ == "LlamaForCausalLM":
-             for layer in self.get_decoder().layers:
-                 layer.condition_media_locations(media_locations)
-                 layer.condition_attend_previous(attend_previous)
-         elif self.__class__.__name__ in ["MPTForCausalLM", "MosaicGPT"]:
-             for layer in self.get_decoder().blocks:
-                 layer.condition_media_locations(media_locations)
-                 layer.condition_attend_previous(attend_previous)
-         else:
-             print("unavailable text encoder")
-         return super().forward(*input, **kwargs)  # call the other parent's forward method
-
-     def is_conditioned(self) -> bool:
-         """Check whether all decoder layers are already conditioned."""
-         return all(l.is_conditioned() for l in self._get_decoder_layers())
-
-     def clear_conditioned_layers(self) -> None:
-         for layer in self._get_decoder_layers():
-             layer.condition_vis_x(None)
-             layer.condition_media_locations(None)
-             layer.condition_attend_previous(None)
-
-
- class FlamingoPreTrainedModel(PreTrainedModel):
-     """
-     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-     models.
-     """
-
-     config_class = FlamingoConfig
-     base_model_prefix = "flamingo"
-     supports_gradient_checkpointing = True
-     _no_split_modules = ["FlamingoPerceiverBlock", "CLIPEncoderLayer", "FlamingoLayer"]
-
-     def _init_weights(self, module):
-         """Flamingo requires no specific initialization"""
-         return super()._init_weights(module)
-
-     def _set_gradient_checkpointing(self, module, value=False):
-         if isinstance(module, FlamingoModel):
-             module.gradient_checkpointing = value
-
-
- class FlamingoModel(FlamingoPreTrainedModel):
-     config_class = FlamingoConfig
-
-     def __init__(
-         self,
-         config: FlamingoConfig,
-     ):
-         super().__init__(config)
-         # TODO: set "LlamaForCausalLM" in text_config.architectures for Llama-based flamingo configs
-         if "llama" not in config.text_config._name_or_path.lower():
-             if config.text_config.architectures[0] == "MPTForCausalLM":
-                 text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
-                 lang_encoder = MPTForCausalLM(config=config.text_config)
-             elif config.text_config.architectures[0] == "MosaicGPT":
-                 text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mosaic-llama-redpajama-final-candidate")
-                 lang_encoder = MosaicGPT(config=config.text_config)
-             elif config.text_config.architectures[0] == "RWForCausalLM":
-                 text_tokenizer = AutoTokenizer.from_pretrained("PATH-TO-YOUR-FALCON")
-                 lang_encoder = RWForCausalLM(config=config.text_config)
-         else:
-             text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path)
-             lang_encoder = LlamaForCausalLM(config=config.text_config)
-
-         vision_encoder = CLIPVisionModel(config=config.vision_config)
-         text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
-         if text_tokenizer.pad_token is None:
-             text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-         self.text_tokenizer = text_tokenizer
-         self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1]
-         self.media_token_id = text_tokenizer.encode("<image>")[-1]
-
-         extend_instance(lang_encoder, FlamingoLMMixin)
-         decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
-         lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
-         if lang_encoder.__class__.__name__ == "LlamaForCausalLM":
-             lang_encoder.resize_token_embeddings(len(text_tokenizer))
-         self.lang_encoder = lang_encoder
-
-         self.cross_attn_every_n_layers = config.cross_attn_every_n_layers if hasattr(config, "cross_attn_every_n_layers") else 4
-         self.use_media_placement_augmentation = config.use_media_placement_augmentation
-
-         vision_encoder.output_tokens = True
-         self.vision_encoder = vision_encoder
-
-         self.vis_dim = 1024
-         self.perceiver = FlamingoPerceiverResampler(dim=self.vis_dim)
-
-         self.lang_encoder.init_flamingo(
-             media_token_id=self.media_token_id,
-             vis_hidden_size=self.vis_dim,
-             cross_attn_every_n_layers=self.cross_attn_every_n_layers,
-             use_media_placement_augmentation=self.use_media_placement_augmentation,
-         )
-         self.post_init()
-
-     def get_input_embeddings(self) -> nn.Module:
-         return self.lang_encoder.get_input_embeddings()
-
-     def set_input_embeddings(self, new_embeddings):
-         self.lang_encoder.set_input_embeddings(new_embeddings)
-
-     def get_output_embeddings(self) -> nn.Module:
-         return self.lang_encoder.get_output_embeddings()
-
-     def set_output_embeddings(self, new_embeddings):
-         self.lang_encoder.set_output_embeddings(new_embeddings)
-
-     def get_image_encoder(self) -> nn.Module:
-         return self.vision_encoder
-
-     def get_lang_encoder(self) -> nn.Module:
-         return self.lang_encoder
-
-     def init_weights(self):
-         # Freeze all parameters in the vision encoder
-         for param in self.vision_encoder.parameters():
-             param.requires_grad = False
-
-         if "lora_config" in self.config.__dict__:
-             print(f"LoRA trainable param: {(sum(p.numel() for p in self.lang_encoder.parameters() if p.requires_grad)) / 1e9:.3f} B")
-             # Unfreeze the gated cross-attention layers
-             for layer in self.lang_encoder._get_decoder_layers():
-                 if layer.gated_cross_attn_layer is not None:
-                     for param in layer.gated_cross_attn_layer.parameters():
-                         param.requires_grad = True
-         else:
-             # Freeze all language-encoder parameters except the gated cross-attention layers
-             for name, param in self.lang_encoder.named_parameters():
-                 if "gated_cross_attn_layer" not in name:
-                     param.requires_grad = False
-             # Unfreeze the LM input and output embeddings
-             self.lang_encoder.get_input_embeddings().requires_grad_(True)
-             # MPTForCausalLM ties its word embeddings, so only Llama needs the lm_head unfrozen
-             if self.lang_encoder.__class__.__name__ == "LlamaForCausalLM":
-                 self.lang_encoder.lm_head.requires_grad_(True)
-
-         print(f"Total trainable param: {(sum(p.numel() for p in self.parameters() if p.requires_grad)) / 1e9:.3f} B")
-
-     def forward(
-         self,
-         vision_x: torch.Tensor,
-         lang_x: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         labels: Optional[torch.Tensor] = None,
-         use_cached_vision_x: bool = False,
-         clear_conditioned_layers: bool = True,
-         past_key_values: Optional[torch.Tensor] = None,
-         use_cache: bool = False,
-         **kwargs,
-     ) -> CausalLMOutputWithPast:
-         """
-         Forward pass of Flamingo.
-
-         Args:
-             vision_x (torch.Tensor): Vision input
-                 shape (B, T_img, F, C, H, W) with F=1
-             lang_x (torch.Tensor): Language input ids
-                 shape (B, T_txt)
-             attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-             labels (torch.Tensor, optional): Labels. Defaults to None.
-             clear_conditioned_layers: if True, clear the conditioned layers
-                 once the forward pass is completed. Set this to False if the
-                 same set of images will be reused in another subsequent
-                 forward pass.
-             past_key_values: pre-computed values to pass to the language model.
-                 See the past_key_values documentation in Hugging Face
-                 CausalLM models.
-             use_cache: whether to use cached key values. See the use_cache
-                 documentation in Hugging Face CausalLM models.
-         """
-         assert (vision_x is not None) or use_cached_vision_x, "Must provide either vision_x or set use_cached_vision_x to True."
-
-         if use_cached_vision_x:
-             # Case: use cached; vision_x should be cached and other
-             # vision-related inputs should not be provided.
-             assert vision_x is None, "Expect vision_x to be None when use_cached_vision_x is True."
-             assert self.lang_encoder.is_conditioned()
-         else:
-             # Case: do not use caching (i.e. this is a standard forward pass)
-             self._encode_vision_x(vision_x=vision_x)
-
-         output = self.lang_encoder(
-             input_ids=lang_x,
-             attention_mask=attention_mask,
-             labels=labels,
-             past_key_values=past_key_values,
-             use_cache=use_cache,
-             **kwargs,
-         )
-
-         if clear_conditioned_layers:
-             self.lang_encoder.clear_conditioned_layers()
-
-         return output
-
-     def _encode_vision_x(self, vision_x: torch.Tensor):
-         """
-         Compute media tokens from vision input by passing it through the vision encoder and conditioning the language model.
-         Args:
-             vision_x (torch.Tensor): Vision input
-                 shape (B, T_img, F, C, H, W)
-                 Images in the same chunk are collated along T_img, and frames are collated along F.
-                 Currently only F=1 is supported (single-frame videos).
-
-         rearrange code based on https://github.com/dhansmair/flamingo-mini
-         """
-
-         assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-         b, T, F = vision_x.shape[:3]
-         assert F == 1, "Only single frame supported"
-
-         vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-         with torch.no_grad():
-             vision_x = self.vision_encoder(vision_x)[0][:, 1:, :]
-         vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-
-         vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
-
-         for layer in self.lang_encoder._get_decoder_layers():
-             layer.condition_vis_x(vision_x)
-
-
- class FlamingoForConditionalGeneration(FlamingoPreTrainedModel):
-     config_class = FlamingoConfig
-
-     def __init__(
-         self,
-         config: FlamingoConfig,
-     ):
-         super().__init__(config)
-         # TODO: the encoders are hardcoded below because Auto* instantiation is too slow
-         # vision_encoder = AutoModel.from_config(config.vision_config).vision_model
-         # lang_encoder = AutoModelForCausalLM.from_config(config.text_config)
-         # text_tokenizer = AutoTokenizer.from_pretrained(config.text_config._name_or_path)
-
-         # TODO: set "LlamaForCausalLM" in text_config.architectures for Llama-based flamingo configs
-         if config.text_config.architectures[0] == "MPTForCausalLM":
-             text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
-             lang_encoder = MPTForCausalLM(config=config.text_config)
-         elif config.text_config.architectures[0] == "MosaicGPT":
-             text_tokenizer = AutoTokenizer.from_pretrained("mosaicml/mosaic-llama-redpajama-final-candidate")
-             lang_encoder = MosaicGPT(config=config.text_config)
-         elif config.text_config.architectures[0] == "RWForCausalLM":
-             text_tokenizer = AutoTokenizer.from_pretrained("PATH-TO-YOUR-FALCON")
-             lang_encoder = RWForCausalLM(config=config.text_config)
-         elif config.text_config.architectures[0] == "LlamaForCausalLM":
-             text_tokenizer = LlamaTokenizer.from_pretrained(config.text_config._name_or_path)
-             lang_encoder = LlamaForCausalLM(config=config.text_config)
-         else:
-             raise ValueError(f"Unsupported text_config architecture: {config.text_config.architectures[0]}")
-
-         vision_encoder = CLIPVisionModel(config=config.vision_config)
-         text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
-         if text_tokenizer.pad_token is None:
-             text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-         self.text_tokenizer = text_tokenizer
-         self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1]
-         self.media_token_id = text_tokenizer.encode("<image>")[-1]
-
-         extend_instance(lang_encoder, FlamingoLMMixin)
-         decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
-         lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
-         if "LlamaForCausalLM" in lang_encoder.__class__.__name__:
-             lang_encoder.resize_token_embeddings(len(text_tokenizer))
-         self.lang_encoder = lang_encoder
-
-         self.cross_attn_every_n_layers = config.cross_attn_every_n_layers if hasattr(config, "cross_attn_every_n_layers") else 4
-         self.use_media_placement_augmentation = config.use_media_placement_augmentation
-
-         vision_encoder.output_tokens = True
-         self.vision_encoder = vision_encoder
-
-         self.vis_dim = 1024
-         self.perceiver = FlamingoPerceiverResampler(dim=self.vis_dim)
-
-         self.lang_encoder.init_flamingo(
-             media_token_id=self.media_token_id,
-             vis_hidden_size=self.vis_dim,
-             cross_attn_every_n_layers=self.cross_attn_every_n_layers,
-             use_media_placement_augmentation=self.use_media_placement_augmentation,
-         )
-         self.post_init()
-
-     def get_input_embeddings(self) -> nn.Module:
-         return self.lang_encoder.get_input_embeddings()
-
-     def set_input_embeddings(self, new_embeddings):
-         self.lang_encoder.set_input_embeddings(new_embeddings)
-
-     def get_output_embeddings(self) -> nn.Module:
-         return self.lang_encoder.get_output_embeddings()
-
-     def set_output_embeddings(self, new_embeddings):
-         self.lang_encoder.set_output_embeddings(new_embeddings)
-
-     def get_image_encoder(self) -> nn.Module:
-         return self.vision_encoder
-
-     def get_lang_encoder(self) -> nn.Module:
-         return self.lang_encoder
-
-     def init_weights(self):
-         # Freeze all parameters in the vision encoder
-         for param in self.vision_encoder.parameters():
-             param.requires_grad = False
-         # Freeze all language-encoder parameters except the gated cross-attention layers
-         for name, param in self.lang_encoder.named_parameters():
-             if "gated_cross_attn_layer" not in name:
-                 param.requires_grad = False
-         # Unfreeze the LM input embeddings
-         self.lang_encoder.get_input_embeddings().requires_grad_(True)
-         # MPTForCausalLM ties its word embeddings, so only Llama needs the lm_head unfrozen
-         if "LlamaForCausalLM" in self.lang_encoder.__class__.__name__:
-             self.lang_encoder.lm_head.requires_grad_(True)
-
-         print("====================Model Grad Part====================")
-         total_params = 0
-         for name, param in self.named_parameters():
-             if param.requires_grad:
-                 total_params += param.numel()
-                 print(f"Parameter: {name}, Size: {param.numel() / 1e6:.6f} M")
-         print(f"Total trainable param: {total_params / 1e9:.4f} B")
-
-     def forward(
-         self,
-         vision_x: torch.Tensor,
-         lang_x: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         labels: Optional[torch.Tensor] = None,
-         use_cached_vision_x: bool = False,
-         clear_conditioned_layers: bool = True,
-         past_key_values: Optional[torch.Tensor] = None,
-         use_cache: bool = False,
-         **kwargs,
-     ) -> CausalLMOutputWithPast:
-         """
-         Forward pass of Flamingo.
-
-         Args:
-             vision_x (torch.Tensor): Vision input
-                 shape (B, T_img, F, C, H, W) with F=1
-             lang_x (torch.Tensor): Language input ids
-                 shape (B, T_txt)
-             attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-             labels (torch.Tensor, optional): Labels. Defaults to None.
-             clear_conditioned_layers: if True, clear the conditioned layers
-                 once the forward pass is completed. Set this to False if the
-                 same set of images will be reused in another subsequent
-                 forward pass.
-             past_key_values: pre-computed values to pass to the language model.
-                 See the past_key_values documentation in Hugging Face
-                 CausalLM models.
-             use_cache: whether to use cached key values. See the use_cache
-                 documentation in Hugging Face CausalLM models.
-         """
-         assert (vision_x is not None) or use_cached_vision_x, "Must provide either vision_x or set use_cached_vision_x to True."
-
-         if use_cached_vision_x:
-             # Case: use cached; vision_x should be cached and other
-             # vision-related inputs should not be provided.
-             assert vision_x is None, "Expect vision_x to be None when use_cached_vision_x is True."
-             assert self.lang_encoder.is_conditioned()
-         else:
-             # Case: do not use caching (i.e. this is a standard forward pass)
-             self._encode_vision_x(vision_x=vision_x)
-
-         output = self.lang_encoder(
-             input_ids=lang_x,
-             attention_mask=attention_mask,
-             labels=labels,
-             past_key_values=past_key_values,
-             use_cache=use_cache,
-             **kwargs,
-         )
-
-         if clear_conditioned_layers:
-             self.lang_encoder.clear_conditioned_layers()
-
-         return output
-
-     def _encode_vision_x(self, vision_x: torch.Tensor):
-         """
-         Compute media tokens from vision input by passing it through the vision encoder and conditioning the language model.
-         Args:
-             vision_x (torch.Tensor): Vision input
-                 shape (B, T_img, F, C, H, W)
-                 Images in the same chunk are collated along T_img, and frames are collated along F.
-                 Currently only F=1 is supported (single-frame videos).
-
-         rearrange code based on https://github.com/dhansmair/flamingo-mini
-         """
-
-         assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
-         b, T, F = vision_x.shape[:3]
-         # assert F == 1, "Only single frame supported"
-
-         vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
-         with torch.no_grad():
-             vision_x = self.vision_encoder(vision_x)[0][:, 1:, :]
-         vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
-
-         vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
-
-         for layer in self.lang_encoder._get_decoder_layers():
-             layer.condition_vis_x(vision_x)
-
-     @torch.no_grad()
-     def generate(
-         self,
-         vision_x: torch.Tensor,
-         lang_x: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         num_beams: int = 1,
-         max_new_tokens: Optional[int] = None,
-         temperature: float = 1.0,
-         top_k: int = 0,
-         top_p: float = 1.0,
-         no_repeat_ngram_size: int = 0,
-         prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
-         length_penalty: float = 1.0,
-         num_return_sequences: int = 1,
-         do_sample: bool = False,
-         early_stopping: bool = False,
-         **kwargs,
-     ):
-         """
-         Generate text conditioned on vision and language inputs.
-
-         Args:
-             vision_x (torch.Tensor): Vision input
-                 shape (B, T_img, F, C, H, W)
-                 images in the same chunk are collated along T_img, and frames are collated along F
-                 currently only F=1 is supported (single-frame videos)
-             lang_x (torch.Tensor): Language input
-                 shape (B, T_txt)
-             attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
-             num_beams (int, optional): Number of beams. Defaults to 1.
-             max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
-             temperature (float, optional): Temperature. Defaults to 1.0.
-             top_k (int, optional): Top k. Defaults to 0.
-             top_p (float, optional): Top p. Defaults to 1.0.
-             no_repeat_ngram_size (int, optional): No-repeat ngram size. Defaults to 0.
-             length_penalty (float, optional): Length penalty. Defaults to 1.0.
-             num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
-             do_sample (bool, optional): Whether to sample. Defaults to False.
-             early_stopping (bool, optional): Early stopping. Defaults to False.
-         Returns:
-             torch.Tensor: lang_x with generated tokens appended
-         """
-         if hasattr(self, "_hf_hook"):
-             # add a hook to make sure that the output of lang_encoder is mapped to the same device as lang_x
-             hook = AlignDevicesHook(
-                 execution_device=lang_x.device,
-                 io_same_device=True,
-                 place_submodules=False,
-             )
-             add_hook_to_module(self.lang_encoder, hook)
-         if num_beams > 1:
-             vision_x = vision_x.repeat_interleave(num_beams, dim=0)
-         self._encode_vision_x(vision_x=vision_x)
-         output = self.lang_encoder.generate(
-             lang_x,
-             attention_mask=attention_mask,
-             eos_token_id=self.eoc_token_id,
-             num_beams=num_beams,
-             max_new_tokens=max_new_tokens,
-             temperature=temperature,
-             top_k=top_k,
-             top_p=top_p,
-             prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-             no_repeat_ngram_size=no_repeat_ngram_size,
-             length_penalty=length_penalty,
-             num_return_sequences=num_return_sequences,
-             do_sample=do_sample,
-             early_stopping=early_stopping,
-             **kwargs,
-         )
-
-         self.lang_encoder.clear_conditioned_layers()
-         return output
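-
-
- # Hedged usage sketch (paths and shapes are illustrative, not part of this file):
- #   config = FlamingoConfig.from_json_file("flamingo/flamingo-mpt-7B.json")
- #   model = FlamingoForConditionalGeneration(config=config)
- #   vision_x = torch.randn(1, 1, 1, 3, 224, 224)  # (B, T_img, F, C, H, W)
- #   lang_x = model.text_tokenizer("<image>An image of", return_tensors="pt").input_ids
- #   out = model.generate(vision_x=vision_x, lang_x=lang_x, max_new_tokens=20)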
mllm/flamingo/mpt/__init__.py DELETED
File without changes
mllm/flamingo/mpt/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (206 Bytes)
 
mllm/flamingo/mpt/__pycache__/attention.cpython-39.pyc DELETED
Binary file (12.2 kB)
 
mllm/flamingo/mpt/__pycache__/blocks.cpython-39.pyc DELETED
Binary file (2.81 kB)
 
mllm/flamingo/mpt/__pycache__/configuration_mpt.cpython-39.pyc DELETED
Binary file (8.76 kB)
 
mllm/flamingo/mpt/__pycache__/custom_embedding.cpython-39.pyc DELETED
Binary file (797 Bytes)
 
mllm/flamingo/mpt/__pycache__/flash_attn_triton.cpython-39.pyc DELETED
Binary file (20.9 kB)
 
mllm/flamingo/mpt/__pycache__/modeling_mpt.cpython-39.pyc DELETED
Binary file (15.3 kB)
 
mllm/flamingo/mpt/__pycache__/norm.cpython-39.pyc DELETED
Binary file (3.03 kB)
 
mllm/flamingo/mpt/__pycache__/param_init_fns.cpython-39.pyc DELETED
Binary file (9.14 kB)
 
mllm/flamingo/mpt/adapt_tokenizer.py DELETED
@@ -1,44 +0,0 @@
- from typing import Union
- from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-
- Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
- NUM_SENTINEL_TOKENS: int = 100
-
-
- def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
-     """Adds sentinel tokens and a padding token (if missing).
-
-     Expands the tokenizer vocabulary to include the sentinel tokens
-     used in mixture-of-denoiser tasks as well as a padding token.
-
-     All added tokens are added as special tokens. No tokens are
-     added if the sentinel tokens and padding token already exist.
-     """
-     sentinels_to_add = [f"<extra_id_{i}>" for i in range(NUM_SENTINEL_TOKENS)]
-     tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
-     if tokenizer.pad_token is None:
-         tokenizer.add_tokens("<pad>", special_tokens=True)
-         tokenizer.pad_token = "<pad>"
-         assert tokenizer.pad_token_id is not None
-     sentinels = "".join([f"<extra_id_{i}>" for i in range(NUM_SENTINEL_TOKENS)])
-     _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
-     tokenizer.sentinel_token_ids = _sentinel_token_ids
-
-
- class AutoTokenizerForMOD(AutoTokenizer):
-     """AutoTokenizer + adaptation for MOD (mixture of denoisers).
-
-     A simple wrapper around AutoTokenizer to make instantiating
-     an MOD-adapted tokenizer a bit easier.
-
-     MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
-     a padding token, and a property to get the token ids of the
-     sentinel tokens.
-     """
-
-     @classmethod
-     def from_pretrained(cls, *args, **kwargs):
-         """See `AutoTokenizer.from_pretrained` docstring."""
-         tokenizer = super().from_pretrained(*args, **kwargs)
-         adapt_tokenizer_for_denoising(tokenizer)
-         return tokenizer
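-
-
- # Hedged usage sketch (model name illustrative):
- #   tokenizer = AutoTokenizerForMOD.from_pretrained("mosaicml/mpt-7b")
- #   assert tokenizer.pad_token_id is not None
- #   assert len(tokenizer.sentinel_token_ids) == NUM_SENTINEL_TOKENS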
mllm/flamingo/mpt/attention.py DELETED
@@ -1,450 +0,0 @@
- """Attention layers."""
- import math
- import warnings
- from typing import Optional
- import torch
- import torch.nn as nn
- from einops import rearrange
- from packaging import version
- from .norm import LPLayerNorm
-
-
- def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
-     if original_is_causal and num_query_tokens != num_key_tokens:
-         if num_query_tokens != 1:
-             raise NotImplementedError("MPT does not support query and key with different numbers of tokens, unless the number of query tokens is 1.")
-         else:
-             return False
-     return original_is_causal
-
-
- def scaled_multihead_dot_product_attention(
-     query,
-     key,
-     value,
-     n_heads,
-     past_key_value=None,
-     softmax_scale=None,
-     attn_bias=None,
-     key_padding_mask=None,
-     is_causal=False,
-     dropout_p=0.0,
-     training=False,
-     needs_weights=False,
-     multiquery=False,
- ):
-     q = rearrange(query, "b s (h d) -> b h s d", h=n_heads)
-     kv_n_heads = 1 if multiquery else n_heads
-     k = rearrange(key, "b s (h d) -> b h d s", h=kv_n_heads)
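-     # note: k is laid out (b, h, d, s) rather than (b, h, s, d), so q.matmul(k)
-     # below directly yields the (b, h, s_q, s_k) attention scores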
- v = rearrange(value, "b s (h d) -> b h s d", h=kv_n_heads)
41
- if past_key_value is not None:
42
- if len(past_key_value) != 0:
43
- k = torch.cat([past_key_value[0], k], dim=3)
44
- v = torch.cat([past_key_value[1], v], dim=2)
45
- past_key_value = (k, v)
46
- (b, _, s_q, d) = q.shape
47
- s_k = k.size(-1)
48
- if softmax_scale is None:
49
- softmax_scale = 1 / math.sqrt(d)
50
- attn_weight = q.matmul(k) * softmax_scale
51
- if attn_bias is not None:
52
- _s_q = max(0, attn_bias.size(2) - s_q)
53
- _s_k = max(0, attn_bias.size(3) - s_k)
54
- attn_bias = attn_bias[:, :, _s_q:, _s_k:]
55
- if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
56
- raise RuntimeError(f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.")
57
- attn_weight = attn_weight + attn_bias
58
- min_val = torch.finfo(q.dtype).min
59
- if key_padding_mask is not None:
60
- if attn_bias is not None:
61
- warnings.warn(
62
- "Propogating key_padding_mask to the attention module "
63
- + "and applying it within the attention module can cause "
64
- + "unneccessary computation/memory usage. Consider integrating "
65
- + "into attn_bias once and passing that to each attention "
66
- + "module instead."
67
- )
68
- attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
69
- if is_causal and (not q.size(2) == 1):
70
- s = max(s_q, s_k)
71
- causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
72
- causal_mask = causal_mask.tril()
73
- causal_mask = causal_mask.to(torch.bool)
74
- causal_mask = ~causal_mask
75
- causal_mask = causal_mask[-s_q:, -s_k:]
76
- attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
77
- attn_weight = torch.softmax(attn_weight, dim=-1)
78
- if dropout_p:
79
- attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
80
- out = attn_weight.to(v.dtype).matmul(v)
81
- out = rearrange(out, "b h s d -> b s (h d)")
82
- if needs_weights:
83
- return (out, attn_weight, past_key_value)
84
- return (out, None, past_key_value)
85
-
86
-
87
- def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
88
- for tensor in tensors:
89
- if tensor.dtype not in valid_dtypes:
90
- raise TypeError(f"tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.")
91
- if not tensor.is_cuda:
92
- raise TypeError(f"Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).")
93
-
94
-
95
- def flash_attn_fn(
96
- query,
97
- key,
98
- value,
99
- n_heads,
100
- past_key_value=None,
101
- softmax_scale=None,
102
- attn_bias=None,
103
- key_padding_mask=None,
104
- is_causal=False,
105
- dropout_p=0.0,
106
- training=False,
107
- needs_weights=False,
108
- multiquery=False,
109
- ):
110
- try:
111
- from flash_attn import bert_padding, flash_attn_interface
112
- except ImportError:
113
- raise RuntimeError("Please install flash-attn==1.0.3.post0")
114
- check_valid_inputs(query, key, value)
115
- if past_key_value is not None:
116
- if len(past_key_value) != 0:
117
- key = torch.cat([past_key_value[0], key], dim=1)
118
- value = torch.cat([past_key_value[1], value], dim=1)
119
- past_key_value = (key, value)
120
- if attn_bias is not None:
121
- _s_q = max(0, attn_bias.size(2) - query.size(1))
122
- _s_k = max(0, attn_bias.size(3) - key.size(1))
123
- attn_bias = attn_bias[:, :, _s_q:, _s_k:]
124
- if attn_bias is not None:
125
- raise NotImplementedError("attn_bias not implemented for flash attn.")
126
- (batch_size, seqlen) = query.shape[:2]
127
- if key_padding_mask is None:
128
- key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
129
- query_padding_mask = key_padding_mask[:, -query.size(1) :]
130
- (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
131
- query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads)
132
- (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
133
- key_unpad = rearrange(key_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads)
134
- (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
135
- value_unpad = rearrange(value_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads)
136
- if multiquery:
137
- key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
138
- value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
139
- dropout_p = dropout_p if training else 0.0
140
- reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
141
- output_unpad = flash_attn_interface.flash_attn_unpadded_func(
142
- query_unpad,
143
- key_unpad,
144
- value_unpad,
145
- cu_seqlens_q,
146
- cu_seqlens_k,
147
- max_seqlen_q,
148
- max_seqlen_k,
149
- dropout_p,
150
- softmax_scale=softmax_scale,
151
- causal=reset_is_causal,
152
- return_attn_probs=needs_weights,
153
- )
154
- output = bert_padding.pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen)
155
- return (output, None, past_key_value)
156
-
157
-
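flash_attn_fn above leans on bert_padding.unpad_input / pad_input from flash-attn to pack the valid tokens into a single "nnz" dimension before the kernel and scatter the output back afterwards. A pure-torch sketch of that round-trip (illustrative only, not the library's implementation):

    import torch

    b, s, d = 2, 4, 8
    x = torch.randn(b, s, d)
    mask = torch.tensor([[1, 1, 1, 0],
                         [1, 1, 0, 0]], dtype=torch.bool)  # valid (non-pad) tokens
    indices = mask.flatten().nonzero(as_tuple=False).flatten()
    x_unpad = x.reshape(b * s, d)[indices]  # (nnz, d); here nnz == 5
    cu_seqlens = torch.tensor([0, 3, 5], dtype=torch.int32)  # prefix sums of lengths, as passed to the kernel
    x_repad = x.new_zeros(b * s, d)
    x_repad[indices] = x_unpad              # scatter back into the padded layout
    assert torch.equal(x_repad.reshape(b, s, d)[mask], x[mask])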
158
- def triton_flash_attn_fn(
159
- query,
160
- key,
161
- value,
162
- n_heads,
163
- past_key_value=None,
164
- softmax_scale=None,
165
- attn_bias=None,
166
- key_padding_mask=None,
167
- is_causal=False,
168
- dropout_p=0.0,
169
- training=False,
170
- needs_weights=False,
171
- multiquery=False,
172
- ):
173
- try:
174
- from .flash_attn_triton import flash_attn_func
175
- except ImportError:
176
- _installed = False
177
- if version.parse(torch.__version__) < version.parse("2.0.0"):
178
- _installed = True
179
- try:
180
- from flash_attn.flash_attn_triton import flash_attn_func
181
- except ImportError:
182
- _installed = False
183
- if not _installed:
184
- raise RuntimeError(
185
- "Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed."
186
- )
187
- check_valid_inputs(query, key, value)
188
- if past_key_value is not None:
189
- if len(past_key_value) != 0:
190
- key = torch.cat([past_key_value[0], key], dim=1)
191
- value = torch.cat([past_key_value[1], value], dim=1)
192
- past_key_value = (key, value)
193
- if attn_bias is not None:
194
- _s_q = max(0, attn_bias.size(2) - query.size(1))
195
- _s_k = max(0, attn_bias.size(3) - key.size(1))
196
- attn_bias = attn_bias[:, :, _s_q:, _s_k:]
197
- if dropout_p:
198
- raise NotImplementedError("Dropout not implemented for attn_impl: triton.")
199
- if needs_weights:
200
- raise NotImplementedError("attn_impl: triton cannot return attn weights.")
201
- if key_padding_mask is not None:
202
- warnings.warn(
203
- "Propagating key_padding_mask to the attention module "
204
- + "and applying it within the attention module can cause "
205
- + "unnecessary computation/memory usage. Consider integrating "
206
- + "into attn_bias once and passing that to each attention "
207
- + "module instead."
208
- )
209
- (b_size, s_k) = key_padding_mask.shape[:2]
210
- if attn_bias is None:
211
- attn_bias = query.new_zeros(b_size, 1, 1, s_k)
212
- attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
213
- query = rearrange(query, "b s (h d) -> b s h d", h=n_heads)
214
- key = rearrange(key, "b s (h d) -> b s h d", h=1 if multiquery else n_heads)
215
- value = rearrange(value, "b s (h d) -> b s h d", h=1 if multiquery else n_heads)
216
- if multiquery:
217
- key = key.expand(*key.shape[:2], n_heads, key.size(-1))
218
- value = value.expand(*value.shape[:2], n_heads, value.size(-1))
219
- reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
220
- attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
221
- output = attn_output.view(*attn_output.shape[:2], -1)
222
- return (output, None, past_key_value)
223
-
224
-
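In the multiquery case above, the single shared key/value head is broadcast to n_heads with expand, which returns a zero-copy view rather than a materialized copy:

    import torch

    nnz, n_heads, head_dim = 10, 8, 64
    k_single = torch.randn(nnz, 1, head_dim)         # one shared K head
    k_all = k_single.expand(nnz, n_heads, head_dim)  # viewed as n_heads heads
    assert k_all.shape == (nnz, n_heads, head_dim)
    assert k_all.data_ptr() == k_single.data_ptr()   # same storage, stride tricks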
225
- class MultiheadAttention(nn.Module):
226
- """Multi-head self attention.
227
-
228
- Using the torch or triton attention implementation enables the user to also use
229
- additive bias.
230
- """
231
-
232
- def __init__(
233
- self,
234
- d_model: int,
235
- n_heads: int,
236
- attn_impl: str = "triton",
237
- clip_qkv: Optional[float] = None,
238
- qk_ln: bool = False,
239
- softmax_scale: Optional[float] = None,
240
- attn_pdrop: float = 0.0,
241
- low_precision_layernorm: bool = False,
242
- verbose: int = 0,
243
- device: Optional[str] = None,
244
- ):
245
- super().__init__()
246
- self.attn_impl = attn_impl
247
- self.clip_qkv = clip_qkv
248
- self.qk_ln = qk_ln
249
- self.d_model = d_model
250
- self.n_heads = n_heads
251
- self.softmax_scale = softmax_scale
252
- if self.softmax_scale is None:
253
- self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
254
- self.attn_dropout_p = attn_pdrop
255
- self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
256
- fuse_splits = (d_model, 2 * d_model)
257
- self.Wqkv._fused = (0, fuse_splits)
258
- if self.qk_ln:
259
- layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
260
- self.q_ln = layernorm_class(self.d_model, device=device)
261
- self.k_ln = layernorm_class(self.d_model, device=device)
262
- if self.attn_impl == "flash":
263
- self.attn_fn = flash_attn_fn
264
- elif self.attn_impl == "triton":
265
- self.attn_fn = triton_flash_attn_fn
266
- if verbose:
267
- warnings.warn(
268
- "While `attn_impl: triton` can be faster than `attn_impl: flash` "
269
- + "it uses more memory. When training larger models this can trigger "
270
- + "alloc retries which hurts performance. If encountered, we recommend "
271
- + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`."
272
- )
273
- elif self.attn_impl == "torch":
274
- self.attn_fn = scaled_multihead_dot_product_attention
275
- if torch.cuda.is_available() and verbose:
276
- warnings.warn(
277
- "Using `attn_impl: torch`. If your model does not use `alibi` or "
278
- + "`prefix_lm` we recommend using `attn_impl: flash` otherwise "
279
- + "we recommend using `attn_impl: triton`."
280
- )
281
- else:
282
- raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
283
- self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
284
- self.out_proj._is_residual = True
285
-
286
- def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
287
- qkv = self.Wqkv(x)
288
- if self.clip_qkv:
289
- qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
290
- (query, key, value) = qkv.chunk(3, dim=2)
291
- key_padding_mask = attention_mask
292
- if self.qk_ln:
293
- dtype = query.dtype
294
- query = self.q_ln(query).to(dtype)
295
- key = self.k_ln(key).to(dtype)
296
- (context, attn_weights, past_key_value) = self.attn_fn(
297
- query,
298
- key,
299
- value,
300
- self.n_heads,
301
- past_key_value=past_key_value,
302
- softmax_scale=self.softmax_scale,
303
- attn_bias=attn_bias,
304
- key_padding_mask=key_padding_mask,
305
- is_causal=is_causal,
306
- dropout_p=self.attn_dropout_p,
307
- training=self.training,
308
- needs_weights=needs_weights,
309
- )
310
- return (self.out_proj(context), attn_weights, past_key_value)
311
-
312
-
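A minimal forward-pass sketch for the class above (assuming it is importable; attn_impl="torch" sidesteps the flash/triton dependencies and runs on CPU):

    import torch

    attn = MultiheadAttention(d_model=512, n_heads=8, attn_impl="torch")
    x = torch.randn(2, 16, 512)
    out, attn_weights, past_kv = attn(x, is_causal=True)
    assert out.shape == (2, 16, 512)
    assert past_kv is None  # caching starts once past_key_value is passed in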
313
- class MultiQueryAttention(nn.Module):
314
- """Multi-Query self attention.
315
-
316
- Using the torch or triton attention implementation enables the user to also use
317
- additive bias.
318
- """
319
-
320
- def __init__(
321
- self,
322
- d_model: int,
323
- n_heads: int,
324
- attn_impl: str = "triton",
325
- clip_qkv: Optional[float] = None,
326
- qk_ln: bool = False,
327
- softmax_scale: Optional[float] = None,
328
- attn_pdrop: float = 0.0,
329
- low_precision_layernorm: bool = False,
330
- verbose: int = 0,
331
- device: Optional[str] = None,
332
- ):
333
- super().__init__()
334
- self.attn_impl = attn_impl
335
- self.clip_qkv = clip_qkv
336
- self.qk_ln = qk_ln
337
- self.d_model = d_model
338
- self.n_heads = n_heads
339
- self.head_dim = d_model // n_heads
340
- self.softmax_scale = softmax_scale
341
- if self.softmax_scale is None:
342
- self.softmax_scale = 1 / math.sqrt(self.head_dim)
343
- self.attn_dropout_p = attn_pdrop
344
- self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
345
- fuse_splits = (d_model, d_model + self.head_dim)
346
- self.Wqkv._fused = (0, fuse_splits)
347
- if self.qk_ln:
348
- layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
349
- self.q_ln = layernorm_class(d_model, device=device)
350
- self.k_ln = layernorm_class(self.head_dim, device=device)
351
- if self.attn_impl == "flash":
352
- self.attn_fn = flash_attn_fn
353
- elif self.attn_impl == "triton":
354
- self.attn_fn = triton_flash_attn_fn
355
- if verbose:
356
- warnings.warn(
357
- "While `attn_impl: triton` can be faster than `attn_impl: flash` "
358
- + "it uses more memory. When training larger models this can trigger "
359
- + "alloc retries which hurts performance. If encountered, we recommend "
360
- + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`."
361
- )
362
- elif self.attn_impl == "torch":
363
- self.attn_fn = scaled_multihead_dot_product_attention
364
- if torch.cuda.is_available() and verbose:
365
- warnings.warn(
366
- "Using `attn_impl: torch`. If your model does not use `alibi` or "
367
- + "`prefix_lm` we recommend using `attn_impl: flash` otherwise "
368
- + "we recommend using `attn_impl: triton`."
369
- )
370
- else:
371
- raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
372
- self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
373
- self.out_proj._is_residual = True
374
-
375
- def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
376
- qkv = self.Wqkv(x)
377
- if self.clip_qkv:
378
- qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
379
- (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
380
- key_padding_mask = attention_mask
381
- if self.qk_ln:
382
- dtype = query.dtype
383
- query = self.q_ln(query).to(dtype)
384
- key = self.k_ln(key).to(dtype)
385
- (context, attn_weights, past_key_value) = self.attn_fn(
386
- query,
387
- key,
388
- value,
389
- self.n_heads,
390
- past_key_value=past_key_value,
391
- softmax_scale=self.softmax_scale,
392
- attn_bias=attn_bias,
393
- key_padding_mask=key_padding_mask,
394
- is_causal=is_causal,
395
- dropout_p=self.attn_dropout_p,
396
- training=self.training,
397
- needs_weights=needs_weights,
398
- multiquery=True,
399
- )
400
- return (self.out_proj(context), attn_weights, past_key_value)
401
-
402
-
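The only structural change from MultiheadAttention is the fused projection: queries keep one slice per head while keys and values share a single head. A back-of-the-envelope comparison with illustrative sizes (d_model=4096, n_heads=32, so head_dim=128):

    d_model, n_heads = 4096, 32
    head_dim = d_model // n_heads
    # MultiheadAttention.Wqkv:  d_model -> 3 * d_model            (4096 -> 12288)
    # MultiQueryAttention.Wqkv: d_model -> d_model + 2 * head_dim (4096 -> 4352)
    assert 3 * d_model == 12288
    assert d_model + 2 * head_dim == 4352  # K/V projections (and the KV cache) shrink roughly n_heads-fold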
403
- def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
404
- if attn_impl == "flash":
405
- return None
406
- elif attn_impl in ["torch", "triton"]:
407
- if alibi:
408
- if (prefix_lm or not causal) or use_sequence_id:
409
- return (1, n_heads, seq_len, seq_len)
410
- return (1, n_heads, 1, seq_len)
411
- elif prefix_lm or use_sequence_id:
412
- return (1, 1, seq_len, seq_len)
413
- return None
414
- else:
415
- raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
416
-
417
-
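A few sanity checks on the shape logic above (a sketch, assuming the function is importable; the positional arguments are attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):

    assert attn_bias_shape("flash", 8, 128, False, False, True, False) is None
    assert attn_bias_shape("torch", 8, 128, True, False, True, False) == (1, 8, 1, 128)    # alibi, causal
    assert attn_bias_shape("torch", 8, 128, True, True, True, False) == (1, 8, 128, 128)   # alibi + prefix_lm
    assert attn_bias_shape("torch", 8, 128, False, True, True, False) == (1, 1, 128, 128)  # prefix_lm only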
418
- def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
419
- if attn_impl == "flash":
420
- return None
421
- elif attn_impl in ["torch", "triton"]:
422
- if alibi:
423
- (device, dtype) = (attn_bias.device, attn_bias.dtype)
424
- attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
425
- return attn_bias
426
- else:
427
- raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
428
-
429
-
430
- def gen_slopes(n_heads, alibi_bias_max=8, device=None):
431
- _n_heads = 2 ** math.ceil(math.log2(n_heads))
432
- m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
433
- m = m.mul(alibi_bias_max / _n_heads)
434
- slopes = 1.0 / torch.pow(2, m)
435
- if _n_heads != n_heads:
436
- slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
437
- return slopes.view(1, n_heads, 1, 1)
438
-
439
-
440
- def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
441
- alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
442
- if full:
443
- alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
444
- alibi_bias = alibi_bias.abs().mul(-1)
445
- slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
446
- alibi_bias = alibi_bias * slopes
447
- return alibi_bias.to(dtype=dtype)
448
-
449
-
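A worked example for the two helpers above (a sketch, assuming they are importable): with n_heads=8 and the default alibi_bias_max=8, the exponents m are simply 1..8, so the per-head slopes are 2^-1 through 2^-8; build_alibi_bias then scales relative key distances by those slopes.

    import torch

    slopes = gen_slopes(8)  # shape (1, 8, 1, 1)
    expected = torch.tensor([2.0 ** -i for i in range(1, 9)])
    assert torch.allclose(slopes.flatten(), expected)

    bias = build_alibi_bias(8, 4)  # causal form: distances [-3, -2, -1, 0] per head
    assert bias.shape == (1, 8, 1, 4)
    assert torch.allclose(bias[0, 0, 0], 0.5 * torch.tensor([-3.0, -2.0, -1.0, 0.0]))  # head 0 has slope 1/2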
450
- ATTN_CLASS_REGISTRY = {"multihead_attention": MultiheadAttention, "multiquery_attention": MultiQueryAttention}
mllm/flamingo/mpt/blocks.py DELETED
@@ -1,82 +0,0 @@
1
- """GPT Blocks used for the GPT Model."""
2
- from typing import Dict, Optional, Tuple
3
- import torch
4
- import torch.nn as nn
5
- from .attention import ATTN_CLASS_REGISTRY
6
- from .norm import NORM_CLASS_REGISTRY
7
-
8
-
9
- class MPTMLP(nn.Module):
10
- def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str] = None):
11
- super().__init__()
12
- self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
13
- ## yh: hard code
14
- # self.act = nn.GELU(approximate='none')
15
- self.act = nn.GELU()
16
- self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
17
- self.down_proj._is_residual = True
18
-
19
- def forward(self, x):
20
- return self.down_proj(self.act(self.up_proj(x)))
21
-
22
-
23
- class MPTBlock(nn.Module):
24
- def __init__(
25
- self,
26
- d_model: int,
27
- n_heads: int,
28
- expansion_ratio: int,
29
- attn_config: Dict = {
30
- "attn_type": "multihead_attention",
31
- "attn_pdrop": 0.0,
32
- "attn_impl": "triton",
33
- "qk_ln": False,
34
- "clip_qkv": None,
35
- "softmax_scale": None,
36
- "prefix_lm": False,
37
- "attn_uses_sequence_id": False,
38
- "alibi": False,
39
- "alibi_bias_max": 8,
40
- },
41
- resid_pdrop: float = 0.0,
42
- norm_type: str = "low_precision_layernorm",
43
- verbose: int = 0,
44
- device: Optional[str] = None,
45
- **kwargs
46
- ):
47
- del kwargs
48
- super().__init__()
49
- norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
50
- attn_class = ATTN_CLASS_REGISTRY[attn_config["attn_type"]]
51
- self.norm_1 = norm_class(d_model, device=device)
52
- self.attn = attn_class(
53
- attn_impl=attn_config["attn_impl"],
54
- clip_qkv=attn_config["clip_qkv"],
55
- qk_ln=attn_config["qk_ln"],
56
- softmax_scale=attn_config["softmax_scale"],
57
- attn_pdrop=attn_config["attn_pdrop"],
58
- d_model=d_model,
59
- n_heads=n_heads,
60
- verbose=verbose,
61
- device=device,
62
- )
63
- self.norm_2 = norm_class(d_model, device=device)
64
- self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
65
- self.resid_attn_dropout = nn.Dropout(resid_pdrop)
66
- self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
67
-
68
- def forward(
69
- self,
70
- x: torch.Tensor,
71
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
72
- attn_bias: Optional[torch.Tensor] = None,
73
- attention_mask: Optional[torch.ByteTensor] = None,
74
- is_causal: bool = True,
75
- ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
76
- a = self.norm_1(x)
77
- (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
78
- x = x + self.resid_attn_dropout(b)
79
- m = self.norm_2(x)
80
- n = self.ffn(m)
81
- x = x + self.resid_ffn_dropout(n)
82
mllm/flamingo/mpt/configuration_mpt.py DELETED
@@ -1,161 +0,0 @@
1
- """A HuggingFace-style model configuration."""
2
- from typing import Dict, Optional, Union
3
- from transformers import PretrainedConfig
4
-
5
- attn_config_defaults: Dict = {
6
- "attn_type": "multihead_attention",
7
- "attn_pdrop": 0.0,
8
- "attn_impl": "triton",
9
- "qk_ln": False,
10
- "clip_qkv": None,
11
- "softmax_scale": None,
12
- "prefix_lm": False,
13
- "attn_uses_sequence_id": False,
14
- "alibi": False,
15
- "alibi_bias_max": 8,
16
- }
17
- init_config_defaults: Dict = {
18
- "name": "kaiming_normal_",
19
- "fan_mode": "fan_in",
20
- "init_nonlinearity": "relu",
21
- "init_div_is_residual": True,
22
- "emb_init_std": None,
23
- "emb_init_uniform_lim": None,
24
- "init_std": None,
25
- "init_gain": 0.0,
26
- }
27
-
28
-
29
- class MPTConfig(PretrainedConfig):
30
- model_type = "mpt"
31
-
32
- def __init__(
33
- self,
34
- d_model: int = 2048,
35
- n_heads: int = 16,
36
- n_layers: int = 24,
37
- expansion_ratio: int = 4,
38
- max_seq_len: int = 2048,
39
- vocab_size: int = 50368,
40
- resid_pdrop: float = 0.0,
41
- emb_pdrop: float = 0.0,
42
- learned_pos_emb: bool = True,
43
- attn_config: Dict = attn_config_defaults,
44
- init_device: str = "cpu",
45
- logit_scale: Optional[Union[float, str]] = None,
46
- no_bias: bool = False,
47
- verbose: int = 0,
48
- embedding_fraction: float = 1.0,
49
- norm_type: str = "low_precision_layernorm",
50
- use_cache: bool = False,
51
- init_config: Dict = init_config_defaults,
52
- **kwargs,
53
- ):
54
- """The MPT configuration class.
55
-
56
- Args:
57
- d_model (int): The size of the embedding dimension of the model.
58
- n_heads (int): The number of attention heads.
59
- n_layers (int): The number of layers in the model.
60
- expansion_ratio (int): The ratio of the up/down scale in the MLP.
61
- max_seq_len (int): The maximum sequence length of the model.
62
- vocab_size (int): The size of the vocabulary.
63
- resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
64
- emb_pdrop (float): The dropout probability for the embedding layer.
65
- learned_pos_emb (bool): Whether to use learned positional embeddings
66
- attn_config (Dict): A dictionary used to configure the model's attention module:
67
- attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
68
- attn_pdrop (float): The dropout probability for the attention layers.
69
- attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
70
- qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
71
- clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
72
- this value.
73
- softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
74
- use the default scale of ``1/sqrt(d_keys)``.
75
- prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
76
- extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
77
- can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
78
- attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
79
- When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
80
- which sub-sequence each token belongs to.
81
- Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
82
- alibi (bool): Whether to use the alibi bias instead of position embeddings.
83
- alibi_bias_max (int): The maximum value of the alibi bias.
84
- init_device (str): The device to use for parameter initialization.
85
- logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
86
- no_bias (bool): Whether to use bias in all layers.
87
- verbose (int): The verbosity level. 0 is silent.
88
- embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
89
- norm_type (str): choose type of norm to use
90
- multiquery_attention (bool): Whether to use multiquery attention implementation.
91
- use_cache (bool): Whether or not the model should return the last key/values attentions
92
- init_config (Dict): A dictionary used to configure the model initialization:
93
- init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
94
- 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
95
- 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
96
- init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
97
- emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
98
- emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
99
- used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
100
- init_std (float): The standard deviation of the normal distribution used to initialize the model,
101
- if using the baseline_ parameter initialization scheme.
102
- init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
103
- fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
104
- init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
105
- ---
106
- See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
107
- """
108
- self.d_model = d_model
109
- self.n_heads = n_heads
110
- self.n_layers = n_layers
111
- self.expansion_ratio = expansion_ratio
112
- self.max_seq_len = max_seq_len
113
- self.vocab_size = vocab_size
114
- self.resid_pdrop = resid_pdrop
115
- self.emb_pdrop = emb_pdrop
116
- self.learned_pos_emb = learned_pos_emb
117
- self.attn_config = attn_config
118
- self.init_device = init_device
119
- self.logit_scale = logit_scale
120
- self.no_bias = no_bias
121
- self.verbose = verbose
122
- self.embedding_fraction = embedding_fraction
123
- self.norm_type = norm_type
124
- self.use_cache = use_cache
125
- self.init_config = init_config
126
- if "name" in kwargs:
127
- del kwargs["name"]
128
- if "loss_fn" in kwargs:
129
- del kwargs["loss_fn"]
130
- super().__init__(**kwargs)
131
- self._validate_config()
132
-
133
- def _set_config_defaults(self, config, config_defaults):
134
- for k, v in config_defaults.items():
135
- if k not in config:
136
- config[k] = v
137
- return config
138
-
139
- def _validate_config(self):
140
- self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
141
- self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
142
- if self.d_model % self.n_heads != 0:
143
- raise ValueError("d_model must be divisible by n_heads")
144
- if any((prob < 0 or prob > 1 for prob in [self.attn_config["attn_pdrop"], self.resid_pdrop, self.emb_pdrop])):
145
- raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
146
- if self.attn_config["attn_impl"] not in ["torch", "flash", "triton"]:
147
- raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
148
- if self.attn_config["prefix_lm"] and self.attn_config["attn_impl"] not in ["torch", "triton"]:
149
- raise NotImplementedError("prefix_lm only implemented with torch and triton attention.")
150
- if self.attn_config["alibi"] and self.attn_config["attn_impl"] not in ["torch", "triton"]:
151
- raise NotImplementedError("alibi only implemented with torch and triton attention.")
152
- if self.attn_config["attn_uses_sequence_id"] and self.attn_config["attn_impl"] not in ["torch", "triton"]:
153
- raise NotImplementedError("attn_uses_sequence_id only implemented with torch and triton attention.")
154
- if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
155
- raise ValueError("model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!")
156
- if isinstance(self.logit_scale, str) and self.logit_scale != "inv_sqrt_d_model":
157
- raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
158
- if self.init_config.get("name", None) is None:
159
- raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
160
- if not self.learned_pos_emb and (not self.attn_config["alibi"]):
161
- raise ValueError("Positional information must be provided to the model using either learned_pos_emb or alibi.")
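A construction sketch (assuming the class above is importable); the config validates itself in __init__, so invalid settings fail fast:

    cfg = MPTConfig(d_model=2048, n_heads=16, n_layers=24)
    assert cfg.d_model % cfg.n_heads == 0

    try:
        MPTConfig(d_model=2048, n_heads=7)  # 2048 is not divisible by 7
    except ValueError as err:
        print(err)  # "d_model must be divisible by n_heads"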
mllm/flamingo/mpt/custom_embedding.py DELETED
@@ -1,11 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- from torch import Tensor
5
-
6
-
7
- class SharedEmbedding(nn.Embedding):
8
- def forward(self, input: Tensor, unembed: bool = False) -> Tensor:
9
- if unembed:
10
- return F.linear(input, self.weight)
11
- return super().forward(input)
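SharedEmbedding implements weight tying: the same matrix both embeds token ids and, with unembed=True, projects hidden states back to vocabulary logits. A short sketch (assuming the class is importable):

    import torch

    emb = SharedEmbedding(num_embeddings=100, embedding_dim=32)
    ids = torch.randint(0, 100, (2, 8))
    h = emb(ids)                   # (2, 8, 32): embedding lookup
    logits = emb(h, unembed=True)  # (2, 8, 100): h @ weight.T
    assert logits.shape == (2, 8, 100)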
mllm/flamingo/mpt/flash_attn_triton.py DELETED
@@ -1,841 +0,0 @@
1
- """
2
- Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
3
- update imports to use 'triton_pre_mlir'
4
-
5
- *Experimental* implementation of FlashAttention in Triton.
6
- Tested with triton==2.0.0.dev20221202.
7
- Triton 2.0 has a new backend (MLIR), but it does not yet seem to work for head dimensions
8
- other than 64:
9
- https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
10
- We'll update this implementation with the new Triton backend once this is fixed.
11
-
12
- We use the FlashAttention implementation from Phil Tillet as a starting point.
13
- https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
14
-
15
- Changes:
16
- - Implement both causal and non-causal attention.
17
- - Implement both self-attention and cross-attention.
18
- - Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
19
- - Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
20
- - Support attention bias.
21
- - Speed up the forward pass a bit, and only store the LSE instead of m and l.
22
- - Make the backward for d=128 much faster by reducing register spilling.
23
- - Optionally parallelize the backward pass across seqlen_k, to deal with the case of
24
- small batch size * nheads.
25
-
26
- Caution:
27
- - This is an *experimental* implementation. The forward pass should be quite robust but
28
- I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
29
- - This implementation has only been tested on A100.
30
- - If you plan to use headdim other than 64 and 128, you should test for race conditions
31
- (due to the Triton compiler), as done in tests/test_flash_attn.py
32
- "test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
33
- for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
34
- that there are none left for other head dimensions.
35
-
36
- Differences between this Triton version and the CUDA version:
37
- - Triton version doesn't support dropout.
38
- - Triton forward is generally faster than CUDA forward, while Triton backward is
39
- generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
40
- than CUDA forward + backward.
41
- - Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
42
- - Triton version supports attention bias, while CUDA version doesn't.
43
- """
44
- import math
45
- import torch
46
- import triton_pre_mlir as triton
47
- import triton_pre_mlir.language as tl
48
-
49
-
50
- @triton.heuristics(
51
- {
52
- "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
53
- "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
54
- "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
55
- }
56
- )
57
- @triton.jit
58
- def _fwd_kernel(
59
- Q,
60
- K,
61
- V,
62
- Bias,
63
- Out,
64
- Lse,
65
- TMP,
66
- softmax_scale,
67
- stride_qb,
68
- stride_qh,
69
- stride_qm,
70
- stride_kb,
71
- stride_kh,
72
- stride_kn,
73
- stride_vb,
74
- stride_vh,
75
- stride_vn,
76
- stride_bb,
77
- stride_bh,
78
- stride_bm,
79
- stride_ob,
80
- stride_oh,
81
- stride_om,
82
- nheads,
83
- seqlen_q,
84
- seqlen_k,
85
- seqlen_q_rounded,
86
- headdim,
87
- CACHE_KEY_SEQLEN_Q,
88
- CACHE_KEY_SEQLEN_K,
89
- BIAS_TYPE: tl.constexpr,
90
- IS_CAUSAL: tl.constexpr,
91
- BLOCK_HEADDIM: tl.constexpr,
92
- EVEN_M: tl.constexpr,
93
- EVEN_N: tl.constexpr,
94
- EVEN_HEADDIM: tl.constexpr,
95
- BLOCK_M: tl.constexpr,
96
- BLOCK_N: tl.constexpr,
97
- ):
98
- start_m = tl.program_id(0)
99
- off_hb = tl.program_id(1)
100
- off_b = off_hb // nheads
101
- off_h = off_hb % nheads
102
- offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
103
- offs_n = tl.arange(0, BLOCK_N)
104
- offs_d = tl.arange(0, BLOCK_HEADDIM)
105
- q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
106
- k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
107
- v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
108
- if BIAS_TYPE == "vector":
109
- b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
110
- elif BIAS_TYPE == "matrix":
111
- b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
112
- t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
113
- lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
114
- m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
115
- acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
116
- if EVEN_M & EVEN_N:
117
- if EVEN_HEADDIM:
118
- q = tl.load(q_ptrs)
119
- else:
120
- q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
121
- elif EVEN_HEADDIM:
122
- q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
123
- else:
124
- q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
125
- end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
126
- for start_n in range(0, end_n, BLOCK_N):
127
- start_n = tl.multiple_of(start_n, BLOCK_N)
128
- if EVEN_N & EVEN_M:
129
- if EVEN_HEADDIM:
130
- k = tl.load(k_ptrs + start_n * stride_kn)
131
- else:
132
- k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
133
- elif EVEN_HEADDIM:
134
- k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
135
- else:
136
- k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
137
- qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
138
- qk += tl.dot(q, k, trans_b=True)
139
- if not EVEN_N:
140
- qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
141
- if IS_CAUSAL:
142
- qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
143
- if BIAS_TYPE != "none":
144
- if BIAS_TYPE == "vector":
145
- if EVEN_N:
146
- bias = tl.load(b_ptrs + start_n).to(tl.float32)
147
- else:
148
- bias = tl.load(b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0).to(tl.float32)
149
- bias = bias[None, :]
150
- elif BIAS_TYPE == "matrix":
151
- if EVEN_M & EVEN_N:
152
- bias = tl.load(b_ptrs + start_n).to(tl.float32)
153
- else:
154
- bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32)
155
- qk = qk * softmax_scale + bias
156
- m_ij = tl.maximum(tl.max(qk, 1), lse_i)
157
- p = tl.exp(qk - m_ij[:, None])
158
- else:
159
- m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
160
- p = tl.exp(qk * softmax_scale - m_ij[:, None])
161
- l_ij = tl.sum(p, 1)
162
- acc_o_scale = tl.exp(m_i - m_ij)
163
- tl.store(t_ptrs, acc_o_scale)
164
- acc_o_scale = tl.load(t_ptrs)
165
- acc_o = acc_o * acc_o_scale[:, None]
166
- if EVEN_N & EVEN_M:
167
- if EVEN_HEADDIM:
168
- v = tl.load(v_ptrs + start_n * stride_vn)
169
- else:
170
- v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
171
- elif EVEN_HEADDIM:
172
- v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
173
- else:
174
- v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
175
- p = p.to(v.dtype)
176
- acc_o += tl.dot(p, v)
177
- m_i = m_ij
178
- l_i_new = tl.exp(lse_i - m_ij) + l_ij
179
- lse_i = m_ij + tl.log(l_i_new)
180
- o_scale = tl.exp(m_i - lse_i)
181
- tl.store(t_ptrs, o_scale)
182
- o_scale = tl.load(t_ptrs)
183
- acc_o = acc_o * o_scale[:, None]
184
- start_m = tl.program_id(0)
185
- offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
186
- lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
187
- tl.store(lse_ptrs, lse_i)
188
- offs_d = tl.arange(0, BLOCK_HEADDIM)
189
- out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :])
190
- if EVEN_M:
191
- if EVEN_HEADDIM:
192
- tl.store(out_ptrs, acc_o)
193
- else:
194
- tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
195
- elif EVEN_HEADDIM:
196
- tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
197
- else:
198
- tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
199
-
200
-
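The forward kernel above uses the streaming ("online") softmax: a running max m_i and log-sum-exp lse_i are updated per key block and the accumulator is rescaled, so the full (seqlen_q, seqlen_k) score matrix is never materialized. A plain-torch reference sketch of the same recurrence (illustrative, not the kernel itself):

    import torch

    def streaming_softmax_matmul(scores, v, block=4):
        m = torch.full(scores.shape[:-1], float("-inf"))    # running max
        lse = torch.full(scores.shape[:-1], float("-inf"))  # running log-sum-exp
        acc = torch.zeros(*scores.shape[:-1], v.shape[-1])  # unnormalized output
        for start in range(0, scores.shape[-1], block):
            s_blk = scores[..., start:start + block]
            m_new = torch.maximum(m, s_blk.max(dim=-1).values)
            p = torch.exp(s_blk - m_new[..., None])
            l_new = torch.exp(lse - m_new) + p.sum(dim=-1)
            # rescale previous accumulator to the new max, then add this block
            acc = acc * torch.exp(m - m_new)[..., None] + p @ v[start:start + block]
            m, lse = m_new, m_new + torch.log(l_new)
        return acc * torch.exp(m - lse)[..., None]  # final softmax normalization

    scores = torch.randn(2, 8, 16)  # (batch, q, k) pre-softmax scores
    v = torch.randn(16, 32)
    ref = torch.softmax(scores, dim=-1) @ v
    assert torch.allclose(streaming_softmax_matmul(scores, v), ref, atol=1e-5)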
201
- @triton.jit
202
- def _bwd_preprocess_do_o_dot(
203
- Out,
204
- DO,
205
- Delta,
206
- stride_ob,
207
- stride_oh,
208
- stride_om,
209
- stride_dob,
210
- stride_doh,
211
- stride_dom,
212
- nheads,
213
- seqlen_q,
214
- seqlen_q_rounded,
215
- headdim,
216
- BLOCK_M: tl.constexpr,
217
- BLOCK_HEADDIM: tl.constexpr,
218
- ):
219
- start_m = tl.program_id(0)
220
- off_hb = tl.program_id(1)
221
- off_b = off_hb // nheads
222
- off_h = off_hb % nheads
223
- offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
224
- offs_d = tl.arange(0, BLOCK_HEADDIM)
225
- o = tl.load(
226
- Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :],
227
- mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
228
- other=0.0,
229
- ).to(tl.float32)
230
- do = tl.load(
231
- DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :],
232
- mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
233
- other=0.0,
234
- ).to(tl.float32)
235
- delta = tl.sum(o * do, axis=1)
236
- tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
237
-
238
-
239
- @triton.jit
240
- def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
241
- if EVEN_N & EVEN_M:
242
- if EVEN_HEADDIM:
243
- tl.store(dv_ptrs, dv)
244
- tl.store(dk_ptrs, dk)
245
- else:
246
- tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
247
- tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
248
- elif EVEN_HEADDIM:
249
- tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
250
- tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
251
- else:
252
- tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
253
- tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
254
-
255
-
256
- @triton.jit
257
- def _bwd_kernel_one_col_block(
258
- start_n,
259
- Q,
260
- K,
261
- V,
262
- Bias,
263
- DO,
264
- DQ,
265
- DK,
266
- DV,
267
- LSE,
268
- D,
269
- softmax_scale,
270
- stride_qm,
271
- stride_kn,
272
- stride_vn,
273
- stride_bm,
274
- stride_dom,
275
- stride_dqm,
276
- stride_dkn,
277
- stride_dvn,
278
- seqlen_q,
279
- seqlen_k,
280
- headdim,
281
- ATOMIC_ADD: tl.constexpr,
282
- BIAS_TYPE: tl.constexpr,
283
- IS_CAUSAL: tl.constexpr,
284
- BLOCK_HEADDIM: tl.constexpr,
285
- EVEN_M: tl.constexpr,
286
- EVEN_N: tl.constexpr,
287
- EVEN_HEADDIM: tl.constexpr,
288
- BLOCK_M: tl.constexpr,
289
- BLOCK_N: tl.constexpr,
290
- ):
291
- begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M
292
- offs_qm = begin_m + tl.arange(0, BLOCK_M)
293
- offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
294
- offs_m = tl.arange(0, BLOCK_M)
295
- offs_d = tl.arange(0, BLOCK_HEADDIM)
296
- q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
297
- k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
298
- v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
299
- do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
300
- dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
301
- if BIAS_TYPE == "vector":
302
- b_ptrs = Bias + offs_n
303
- elif BIAS_TYPE == "matrix":
304
- b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
305
- dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
306
- dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
307
- if begin_m >= seqlen_q:
308
- dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
309
- dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
310
- _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
311
- return
312
- if EVEN_N & EVEN_M:
313
- if EVEN_HEADDIM:
314
- k = tl.load(k_ptrs)
315
- v = tl.load(v_ptrs)
316
- else:
317
- k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
318
- v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
319
- elif EVEN_HEADDIM:
320
- k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
321
- v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
322
- else:
323
- k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
324
- v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
325
- num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
326
- for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
327
- start_m = tl.multiple_of(start_m, BLOCK_M)
328
- offs_m_curr = start_m + offs_m
329
- if EVEN_M & EVEN_HEADDIM:
330
- q = tl.load(q_ptrs)
331
- elif EVEN_HEADDIM:
332
- q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
333
- else:
334
- q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
335
- qk = tl.dot(q, k, trans_b=True)
336
- if not EVEN_N:
337
- qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))
338
- if IS_CAUSAL:
339
- qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float("-inf"))
340
- if BIAS_TYPE != "none":
341
- tl.debug_barrier()
342
- if BIAS_TYPE == "vector":
343
- if EVEN_N:
344
- bias = tl.load(b_ptrs).to(tl.float32)
345
- else:
346
- bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
347
- bias = bias[None, :]
348
- elif BIAS_TYPE == "matrix":
349
- if EVEN_M & EVEN_N:
350
- bias = tl.load(b_ptrs).to(tl.float32)
351
- else:
352
- bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32)
353
- qk = qk * softmax_scale + bias
354
- if not EVEN_M & EVEN_HEADDIM:
355
- tl.debug_barrier()
356
- lse_i = tl.load(LSE + offs_m_curr)
357
- if BIAS_TYPE == "none":
358
- p = tl.exp(qk * softmax_scale - lse_i[:, None])
359
- else:
360
- p = tl.exp(qk - lse_i[:, None])
361
- if EVEN_M & EVEN_HEADDIM:
362
- do = tl.load(do_ptrs)
363
- else:
364
- do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
365
- dv += tl.dot(p.to(do.dtype), do, trans_a=True)
366
- if not EVEN_M & EVEN_HEADDIM:
367
- tl.debug_barrier()
368
- dp = tl.dot(do, v, trans_b=True)
369
- if not EVEN_HEADDIM:
370
- tl.debug_barrier()
371
- Di = tl.load(D + offs_m_curr)
372
- ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
373
- dk += tl.dot(ds, q, trans_a=True)
374
- if not EVEN_M & EVEN_HEADDIM:
375
- tl.debug_barrier()
376
- if not ATOMIC_ADD:
377
- if EVEN_M & EVEN_HEADDIM:
378
- dq = tl.load(dq_ptrs, eviction_policy="evict_last")
379
- dq += tl.dot(ds, k)
380
- tl.store(dq_ptrs, dq, eviction_policy="evict_last")
381
- elif EVEN_HEADDIM:
382
- dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy="evict_last")
383
- dq += tl.dot(ds, k)
384
- tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy="evict_last")
385
- else:
386
- dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy="evict_last")
387
- dq += tl.dot(ds, k)
388
- tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy="evict_last")
389
- else:
390
- dq = tl.dot(ds, k)
391
- if EVEN_M & EVEN_HEADDIM:
392
- tl.atomic_add(dq_ptrs, dq)
393
- elif EVEN_HEADDIM:
394
- tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
395
- else:
396
- tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
397
- dq_ptrs += BLOCK_M * stride_dqm
398
- q_ptrs += BLOCK_M * stride_qm
399
- do_ptrs += BLOCK_M * stride_dom
400
- if BIAS_TYPE == "matrix":
401
- b_ptrs += BLOCK_M * stride_bm
402
- dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
403
- dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
404
- _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
405
-
406
-
407
- def init_to_zero(name):
408
- return lambda nargs: nargs[name].zero_()
409
-
410
-
411
- @triton.autotune(
412
- configs=[
413
- triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero("DQ")),
414
- triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero("DQ")),
415
- ],
416
- key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"],
417
- )
418
- @triton.heuristics(
419
- {
420
- "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
421
- "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
422
- "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
423
- }
424
- )
425
- @triton.jit
426
- def _bwd_kernel(
427
- Q,
428
- K,
429
- V,
430
- Bias,
431
- DO,
432
- DQ,
433
- DK,
434
- DV,
435
- LSE,
436
- D,
437
- softmax_scale,
438
- stride_qb,
439
- stride_qh,
440
- stride_qm,
441
- stride_kb,
442
- stride_kh,
443
- stride_kn,
444
- stride_vb,
445
- stride_vh,
446
- stride_vn,
447
- stride_bb,
448
- stride_bh,
449
- stride_bm,
450
- stride_dob,
451
- stride_doh,
452
- stride_dom,
453
- stride_dqb,
454
- stride_dqh,
455
- stride_dqm,
456
- stride_dkb,
457
- stride_dkh,
458
- stride_dkn,
459
- stride_dvb,
460
- stride_dvh,
461
- stride_dvn,
462
- nheads,
463
- seqlen_q,
464
- seqlen_k,
465
- seqlen_q_rounded,
466
- headdim,
467
- CACHE_KEY_SEQLEN_Q,
468
- CACHE_KEY_SEQLEN_K,
469
- BIAS_TYPE: tl.constexpr,
470
- IS_CAUSAL: tl.constexpr,
471
- BLOCK_HEADDIM: tl.constexpr,
472
- SEQUENCE_PARALLEL: tl.constexpr,
473
- EVEN_M: tl.constexpr,
474
- EVEN_N: tl.constexpr,
475
- EVEN_HEADDIM: tl.constexpr,
476
- BLOCK_M: tl.constexpr,
477
- BLOCK_N: tl.constexpr,
478
- ):
479
- off_hb = tl.program_id(1)
480
- off_b = off_hb // nheads
481
- off_h = off_hb % nheads
482
- Q += off_b * stride_qb + off_h * stride_qh
483
- K += off_b * stride_kb + off_h * stride_kh
484
- V += off_b * stride_vb + off_h * stride_vh
485
- DO += off_b * stride_dob + off_h * stride_doh
486
- DQ += off_b * stride_dqb + off_h * stride_dqh
487
- DK += off_b * stride_dkb + off_h * stride_dkh
488
- DV += off_b * stride_dvb + off_h * stride_dvh
489
- if BIAS_TYPE != "none":
490
- Bias += off_b * stride_bb + off_h * stride_bh
491
- D += off_hb * seqlen_q_rounded
492
- LSE += off_hb * seqlen_q_rounded
493
- if not SEQUENCE_PARALLEL:
494
- num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
495
- for start_n in range(0, num_block_n):
496
- _bwd_kernel_one_col_block(
497
- start_n,
498
- Q,
499
- K,
500
- V,
501
- Bias,
502
- DO,
503
- DQ,
504
- DK,
505
- DV,
506
- LSE,
507
- D,
508
- softmax_scale,
509
- stride_qm,
510
- stride_kn,
511
- stride_vn,
512
- stride_bm,
513
- stride_dom,
514
- stride_dqm,
515
- stride_dkn,
516
- stride_dvn,
517
- seqlen_q,
518
- seqlen_k,
519
- headdim,
520
- ATOMIC_ADD=False,
521
- BIAS_TYPE=BIAS_TYPE,
522
- IS_CAUSAL=IS_CAUSAL,
523
- BLOCK_HEADDIM=BLOCK_HEADDIM,
524
- EVEN_M=EVEN_M,
525
- EVEN_N=EVEN_N,
526
- EVEN_HEADDIM=EVEN_HEADDIM,
527
- BLOCK_M=BLOCK_M,
528
- BLOCK_N=BLOCK_N,
529
- )
530
- else:
531
- start_n = tl.program_id(0)
532
- _bwd_kernel_one_col_block(
533
- start_n,
534
- Q,
535
- K,
536
- V,
537
- Bias,
538
- DO,
539
- DQ,
540
- DK,
541
- DV,
542
- LSE,
543
- D,
544
- softmax_scale,
545
- stride_qm,
546
- stride_kn,
547
- stride_vn,
548
- stride_bm,
549
- stride_dom,
550
- stride_dqm,
551
- stride_dkn,
552
- stride_dvn,
553
- seqlen_q,
554
- seqlen_k,
555
- headdim,
556
- ATOMIC_ADD=True,
557
- BIAS_TYPE=BIAS_TYPE,
558
- IS_CAUSAL=IS_CAUSAL,
559
- BLOCK_HEADDIM=BLOCK_HEADDIM,
560
- EVEN_M=EVEN_M,
561
- EVEN_N=EVEN_N,
562
- EVEN_HEADDIM=EVEN_HEADDIM,
563
- BLOCK_M=BLOCK_M,
564
- BLOCK_N=BLOCK_N,
565
- )
566
-
567
-
568
- def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
569
- (batch, seqlen_q, nheads, d) = q.shape
570
- (_, seqlen_k, _, _) = k.shape
571
- assert k.shape == (batch, seqlen_k, nheads, d)
572
- assert v.shape == (batch, seqlen_k, nheads, d)
573
- assert d <= 128, "FlashAttention only supports head dimensions up to 128"
574
- assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
575
- assert q.dtype in [torch.float16, torch.bfloat16], "Only fp16 and bf16 are supported"
576
- assert q.is_cuda and k.is_cuda and v.is_cuda
577
- softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
578
- has_bias = bias is not None
579
- bias_type = "none"
580
- if has_bias:
581
- assert bias.dtype in [q.dtype, torch.float]
582
- assert bias.is_cuda
583
- assert bias.dim() == 4
584
- if bias.stride(-1) != 1:
585
- bias = bias.contiguous()
586
- if bias.shape[2:] == (1, seqlen_k):
587
- bias_type = "vector"
588
- elif bias.shape[2:] == (seqlen_q, seqlen_k):
589
- bias_type = "matrix"
590
- else:
591
- raise RuntimeError("Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)")
592
- bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
593
- bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
594
- seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
595
- lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
596
- tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
597
- o = torch.empty_like(q)
598
- BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
599
- BLOCK = 128
600
- num_warps = 4 if d <= 64 else 8
601
- grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
602
- _fwd_kernel[grid](
603
- q,
604
- k,
605
- v,
606
- bias,
607
- o,
608
- lse,
609
- tmp,
610
- softmax_scale,
611
- q.stride(0),
612
- q.stride(2),
613
- q.stride(1),
614
- k.stride(0),
615
- k.stride(2),
616
- k.stride(1),
617
- v.stride(0),
618
- v.stride(2),
619
- v.stride(1),
620
- *bias_strides,
621
- o.stride(0),
622
- o.stride(2),
623
- o.stride(1),
624
- nheads,
625
- seqlen_q,
626
- seqlen_k,
627
- seqlen_q_rounded,
628
- d,
629
- seqlen_q // 32,
630
- seqlen_k // 32,
631
- bias_type,
632
- causal,
633
- BLOCK_HEADDIM,
634
- BLOCK_M=BLOCK,
635
- BLOCK_N=BLOCK,
636
- num_warps=num_warps,
637
- num_stages=1
638
- )
639
- return (o, lse, softmax_scale)
640
-
641
-
642
- def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
643
- if do.stride(-1) != 1:
644
- do = do.contiguous()
645
- (batch, seqlen_q, nheads, d) = q.shape
646
- (_, seqlen_k, _, _) = k.shape
647
- assert d <= 128
648
- seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
649
- assert lse.shape == (batch, nheads, seqlen_q_rounded)
650
- assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
651
- assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
652
- softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
653
- dq_accum = torch.empty_like(q, dtype=torch.float32)
654
- delta = torch.empty_like(lse)
655
- BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
656
- grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
657
- _bwd_preprocess_do_o_dot[grid](
658
- o,
659
- do,
660
- delta,
661
- o.stride(0),
662
- o.stride(2),
663
- o.stride(1),
664
- do.stride(0),
665
- do.stride(2),
666
- do.stride(1),
667
- nheads,
668
- seqlen_q,
669
- seqlen_q_rounded,
670
- d,
671
- BLOCK_M=128,
672
- BLOCK_HEADDIM=BLOCK_HEADDIM,
673
- )
674
- has_bias = bias is not None
675
- bias_type = "none"
676
- if has_bias:
677
- assert bias.dtype in [q.dtype, torch.float]
678
- assert bias.is_cuda
679
- assert bias.dim() == 4
680
- assert bias.stride(-1) == 1
681
- if bias.shape[2:] == (1, seqlen_k):
682
- bias_type = "vector"
683
- elif bias.shape[2:] == (seqlen_q, seqlen_k):
684
- bias_type = "matrix"
685
- else:
686
- raise RuntimeError("Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)")
687
- bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
688
- bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
689
- grid = lambda META: (triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1, batch * nheads)
690
- _bwd_kernel[grid](
691
- q,
692
- k,
693
- v,
694
- bias,
695
- do,
696
- dq_accum,
697
- dk,
698
- dv,
699
- lse,
700
- delta,
701
- softmax_scale,
702
- q.stride(0),
703
- q.stride(2),
704
- q.stride(1),
705
- k.stride(0),
706
- k.stride(2),
707
- k.stride(1),
708
- v.stride(0),
709
- v.stride(2),
710
- v.stride(1),
711
- *bias_strides,
712
- do.stride(0),
713
- do.stride(2),
714
- do.stride(1),
715
- dq_accum.stride(0),
716
- dq_accum.stride(2),
717
- dq_accum.stride(1),
718
- dk.stride(0),
719
- dk.stride(2),
720
- dk.stride(1),
721
- dv.stride(0),
722
- dv.stride(2),
723
- dv.stride(1),
724
- nheads,
725
- seqlen_q,
726
- seqlen_k,
727
- seqlen_q_rounded,
728
- d,
729
- seqlen_q // 32,
730
- seqlen_k // 32,
731
- bias_type,
732
- causal,
733
- BLOCK_HEADDIM
734
- )
735
- dq.copy_(dq_accum)
736
-
737
-
738
- class FlashAttnQKVPackedFunc(torch.autograd.Function):
739
- @staticmethod
740
- def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
741
- """
742
- qkv: (batch, seqlen, 3, nheads, headdim)
743
- bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
744
- For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
745
- ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
746
- """
747
- if qkv.stride(-1) != 1:
748
- qkv = qkv.contiguous()
749
- (o, lse, ctx.softmax_scale) = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
750
- ctx.save_for_backward(qkv, o, lse, bias)
751
- ctx.causal = causal
752
- return o
753
-
754
- @staticmethod
755
- def backward(ctx, do):
756
- (qkv, o, lse, bias) = ctx.saved_tensors
757
- assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet"
758
- with torch.inference_mode():
759
- dqkv = torch.empty_like(qkv)
760
- _flash_attn_backward(
761
- do,
762
- qkv[:, :, 0],
763
- qkv[:, :, 1],
764
- qkv[:, :, 2],
765
- o,
766
- lse,
767
- dqkv[:, :, 0],
768
- dqkv[:, :, 1],
769
- dqkv[:, :, 2],
770
- bias=bias,
771
- causal=ctx.causal,
772
- softmax_scale=ctx.softmax_scale,
773
- )
774
- return (dqkv, None, None, None)
775
-
776
-
777
- flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
778
-
779
-
780
- class FlashAttnKVPackedFunc(torch.autograd.Function):
781
- @staticmethod
782
- def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
783
- """
784
- q: (batch, seqlen_q, nheads, headdim)
785
- kv: (batch, seqlen_k, 2, nheads, headdim)
786
- bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
787
- For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
788
- ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
789
- """
790
- (q, kv) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
791
- (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
792
- ctx.save_for_backward(q, kv, o, lse, bias)
793
- ctx.causal = causal
794
- return o
795
-
796
- @staticmethod
797
- def backward(ctx, do):
798
- (q, kv, o, lse, bias) = ctx.saved_tensors
799
- if len(ctx.needs_input_grad) >= 3:
800
- assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet"
801
- with torch.inference_mode():
802
- dq = torch.empty_like(q)
803
- dkv = torch.empty_like(kv)
804
- _flash_attn_backward(
805
- do, q, kv[:, :, 0], kv[:, :, 1], o, lse, dq, dkv[:, :, 0], dkv[:, :, 1], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale
806
- )
807
- return (dq, dkv, None, None, None)
808
-
809
-
810
- flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
811
-
812
-
813
- class FlashAttnFunc(torch.autograd.Function):
814
- @staticmethod
815
- def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
816
- """
817
- q: (batch_size, seqlen_q, nheads, headdim)
818
- k, v: (batch_size, seqlen_k, nheads, headdim)
819
- bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
820
- For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
821
- ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
822
- """
823
- (q, k, v) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
824
- (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
825
- ctx.save_for_backward(q, k, v, o, lse, bias)
826
- ctx.causal = causal
827
- return o
828
-
829
- @staticmethod
830
- def backward(ctx, do):
831
- (q, k, v, o, lse, bias) = ctx.saved_tensors
832
- assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet"
833
- with torch.inference_mode():
834
- dq = torch.empty_like(q)
835
- dk = torch.empty_like(k)
836
- dv = torch.empty_like(v)
837
- _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
838
- return (dq, dk, dv, None, None, None)
839
-
840
-
841
- flash_attn_func = FlashAttnFunc.apply
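
For reference, a minimal usage sketch of the wrappers deleted above. This is illustrative only, not code from this repo: it assumes a CUDA device with a working triton install, fp16 inputs, and the pre-deletion import path. Because these names are `torch.autograd.Function.apply` bindings, arguments must be passed positionally.

    import torch
    from mllm.flamingo.mpt.flash_attn_triton import flash_attn_func  # path as it existed before this commit

    batch, seqlen, nheads, headdim = 2, 128, 8, 64
    q = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.float16, requires_grad=True)
    k = torch.randn_like(q, requires_grad=True)
    v = torch.randn_like(q, requires_grad=True)
    out = flash_attn_func(q, k, v, None, True, None)  # bias=None, causal=True, default softmax scale
    out.sum().backward()  # runs FlashAttnFunc.backward via the Triton kernels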
mllm/flamingo/mpt/hf_prefixlm_converter.py DELETED
@@ -1,575 +0,0 @@
- """Converts Huggingface Causal LM to Prefix LM.
-
- Conversion does lightweight surgery on a HuggingFace
- Causal LM to convert it to a Prefix LM.
-
- Prefix LMs accept a `bidirectional_mask` input in `forward`
- and treat the input prompt as the prefix in `generate`.
- """
- import math
- import warnings
- from types import MethodType
- from typing import Any, Dict, List, Optional, Tuple, Union
- import torch
- from transformers.models.bloom.modeling_bloom import (
-     BaseModelOutputWithPastAndCrossAttentions,
-     BloomForCausalLM,
-     BloomModel,
-     CausalLMOutputWithCrossAttentions,
-     CrossEntropyLoss,
- )
- from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom
- from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom
- from transformers.models.bloom.modeling_bloom import logging
- from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
- from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
- from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
- from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
- from transformers.models.opt.modeling_opt import OPTForCausalLM
- from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt
- from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
-
- logger = logging.get_logger(__name__)
- _SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
- CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM]
-
-
- def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
-     """Converts a GPT-style Causal LM to a Prefix LM.
-
-     Supported HuggingFace model classes:
-         - `GPT2LMHeadModel`
-         - `GPTNeoForCausalLM`
-         - `GPTNeoXForCausalLM`
-         - `GPTJForCausalLM`
-
-     See `convert_hf_causal_lm_to_prefix_lm` for more details.
-     """
-     if hasattr(model, "_prefix_lm_converted"):
-         return model
-     assert isinstance(model, _SUPPORTED_GPT_MODELS)
-     assert model.config.add_cross_attention == False, "Only supports GPT-style decoder-only models"
-
-     def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
-         """Helper that gets a list of the model's attention modules.
-
-         Each module has a `bias` buffer used for causal masking. The Prefix LM
-         conversion adds logic to dynamically manipulate these biases to support
-         Prefix LM attention masking.
-         """
-         attn_modules = []
-         if isinstance(model, GPTNeoXForCausalLM):
-             blocks = model.gpt_neox.layers
-         else:
-             blocks = model.transformer.h
-         for block in blocks:
-             if isinstance(model, GPTNeoForCausalLM):
-                 if block.attn.attention_type != "global":
-                     continue
-                 attn_module = block.attn.attention
-             elif isinstance(model, GPTNeoXForCausalLM):
-                 attn_module = block.attention
-             else:
-                 attn_module = block.attn
-             attn_modules.append(attn_module)
-         return attn_modules
-
-     setattr(model, "_original_forward", getattr(model, "forward"))
-     setattr(model, "_original_generate", getattr(model, "generate"))
-
-     def forward(
-         self: CAUSAL_GPT_TYPES,
-         input_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-         attention_mask: Optional[torch.FloatTensor] = None,
-         bidirectional_mask: Optional[torch.Tensor] = None,
-         token_type_ids: Optional[torch.LongTensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         head_mask: Optional[torch.FloatTensor] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ):
-         """Wraps original forward to enable PrefixLM attention."""
-
-         def call_og_forward():
-             if isinstance(self, GPTNeoXForCausalLM):
-                 return self._original_forward(
-                     input_ids=input_ids,
-                     past_key_values=past_key_values,
-                     attention_mask=attention_mask,
-                     head_mask=head_mask,
-                     inputs_embeds=inputs_embeds,
-                     labels=labels,
-                     use_cache=use_cache,
-                     output_attentions=output_attentions,
-                     output_hidden_states=output_hidden_states,
-                     return_dict=return_dict,
-                 )
-             else:
-                 return self._original_forward(
-                     input_ids=input_ids,
-                     past_key_values=past_key_values,
-                     attention_mask=attention_mask,
-                     token_type_ids=token_type_ids,
-                     position_ids=position_ids,
-                     head_mask=head_mask,
-                     inputs_embeds=inputs_embeds,
-                     labels=labels,
-                     use_cache=use_cache,
-                     output_attentions=output_attentions,
-                     output_hidden_states=output_hidden_states,
-                     return_dict=return_dict,
-                 )
-
-         if bidirectional_mask is None:
-             return call_og_forward()
-         assert isinstance(bidirectional_mask, torch.Tensor)
-         attn_modules = _get_attn_modules(model)
-         (b, s) = bidirectional_mask.shape
-         max_length = attn_modules[0].bias.shape[-1]
-         if s > max_length:
-             raise ValueError(f"bidirectional_mask sequence length (={s}) exceeds the " + f"max length allowed by the model ({max_length}).")
-         assert s <= max_length
-         if s < max_length:
-             pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device)
-             bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
-         bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
-         for attn_module in attn_modules:
-             attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional)
-         output = call_og_forward()
-         for attn_module in attn_modules:
-             attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
-         return output
-
-     def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]):
-         """Wraps original generate to enable PrefixLM attention."""
-         attn_modules = _get_attn_modules(model)
-         for attn_module in attn_modules:
-             attn_module.bias.data[:] = 1
-         output = self._original_generate(*args, **kwargs)
-         for attn_module in attn_modules:
-             attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
-         return output
-
-     setattr(model, "forward", MethodType(forward, model))
-     setattr(model, "generate", MethodType(generate, model))
-     setattr(model, "_prefix_lm_converted", True)
-     return model
-
-
- def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
-     """Converts a BLOOM Causal LM to a Prefix LM.
-
-     Supported HuggingFace model classes:
-         - `BloomForCausalLM`
-
-     See `convert_hf_causal_lm_to_prefix_lm` for more details.
-     """
-     if hasattr(model, "_prefix_lm_converted"):
-         return model
-     assert isinstance(model, BloomForCausalLM)
-     assert model.config.add_cross_attention == False, "Only supports BLOOM decoder-only models"
-
-     def _prepare_attn_mask(
-         self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int
-     ) -> torch.BoolTensor:
-         combined_attention_mask = None
-         device = attention_mask.device
-         (_, src_length) = input_shape
-         if src_length > 1:
-             combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length)
-             if bidirectional_mask is not None:
-                 assert attention_mask.shape == bidirectional_mask.shape
-                 expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length)
-                 combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask)
-         expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
-         combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
-         return combined_attention_mask
-
-     def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-         num_heads = self.config.n_head
-         closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
-         base = torch.tensor(2 ** (-(2 ** (-(math.log2(closest_power_of_2) - 3)))), device=device, dtype=torch.float32)
-         powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32)
-         slopes = torch.pow(base, powers)
-         if closest_power_of_2 != num_heads:
-             extra_base = torch.tensor(2 ** (-(2 ** (-(math.log2(2 * closest_power_of_2) - 3)))), device=device, dtype=torch.float32)
-             num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
-             extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32)
-             slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
-         qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1)
-         ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1)
-         diffs = qa - ka + key_length - query_length
-         diffs = -diffs.abs()
-         alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length)
-         alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length)
-         return alibi.to(dtype)
-
-     KeyValueT = Tuple[torch.Tensor, torch.Tensor]
-
-     def forward(
-         self: BloomModel,
-         input_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[Tuple[KeyValueT, ...]] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         bidirectional_mask: Optional[torch.Tensor] = None,
-         head_mask: Optional[torch.LongTensor] = None,
-         inputs_embeds: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-         **deprecated_arguments,
-     ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
-         if deprecated_arguments.pop("position_ids", False) is not False:
-             warnings.warn(
-                 "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. " + "You can safely ignore passing `position_ids`.", FutureWarning
-             )
-         if len(deprecated_arguments) > 0:
-             raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         use_cache = use_cache if use_cache is not None else self.config.use_cache
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-         if input_ids is not None and inputs_embeds is not None:
-             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-         elif input_ids is not None:
-             (batch_size, seq_length) = input_ids.shape
-         elif inputs_embeds is not None:
-             (batch_size, seq_length, _) = inputs_embeds.shape
-         else:
-             raise ValueError("You have to specify either input_ids or inputs_embeds")
-         if past_key_values is None:
-             past_key_values = tuple([None] * len(self.h))
-         head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-         if inputs_embeds is None:
-             inputs_embeds = self.word_embeddings(input_ids)
-         hidden_states = self.word_embeddings_layernorm(inputs_embeds)
-         presents = () if use_cache else None
-         all_self_attentions = () if output_attentions else None
-         all_hidden_states = () if output_hidden_states else None
-         seq_length_with_past = seq_length
-         past_key_values_length = 0
-         if past_key_values[0] is not None:
-             tmp = past_key_values[0][0]
-             past_key_values_length = tmp.shape[2]
-             seq_length_with_past = seq_length_with_past + past_key_values_length
-         if attention_mask is None:
-             attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
-         else:
-             attention_mask = attention_mask.to(hidden_states.device)
-         alibi = self._build_alibi_tensor(
-             batch_size=batch_size, query_length=seq_length, key_length=seq_length_with_past, dtype=hidden_states.dtype, device=hidden_states.device
-         )
-         causal_mask = self._prepare_attn_mask(
-             attention_mask, bidirectional_mask, input_shape=(batch_size, seq_length), past_key_values_length=past_key_values_length
-         )
-         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-             if output_hidden_states:
-                 hst = (hidden_states,)
-                 all_hidden_states = all_hidden_states + hst
-             if self.gradient_checkpointing and self.training:
-                 if use_cache:
-                     logger.warning("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
-                     use_cache = False
-
-                 def create_custom_forward(module):
-                     def custom_forward(*inputs):
-                         return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)
-
-                     return custom_forward
-
-                 outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i])
-             else:
-                 outputs = block(
-                     hidden_states,
-                     layer_past=layer_past,
-                     attention_mask=causal_mask,
-                     head_mask=head_mask[i],
-                     use_cache=use_cache,
-                     output_attentions=output_attentions,
-                     alibi=alibi,
-                 )
-             hidden_states = outputs[0]
-             if use_cache is True:
-                 presents = presents + (outputs[1],)
-             if output_attentions:
-                 oa = (outputs[2 if use_cache else 1],)
-                 all_self_attentions = all_self_attentions + oa
-         hidden_states = self.ln_f(hidden_states)
-         if output_hidden_states:
-             hst = (hidden_states,)
-             all_hidden_states = all_hidden_states + hst
-         if not return_dict:
-             return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None))
-         return BaseModelOutputWithPastAndCrossAttentions(
-             last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions
-         )
-
-     setattr(model.transformer, "_prepare_attn_mask", MethodType(_prepare_attn_mask, model.transformer))
-     setattr(model.transformer, "_build_alibi_tensor", MethodType(_build_alibi_tensor, model.transformer))
-     setattr(model.transformer, "forward", MethodType(forward, model.transformer))
-     KeyValueT = Tuple[torch.Tensor, torch.Tensor]
-
-     def forward(
-         self: BloomForCausalLM,
-         input_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[Tuple[KeyValueT, ...]] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         bidirectional_mask: Optional[torch.Tensor] = None,
-         head_mask: Optional[torch.Tensor] = None,
-         inputs_embeds: Optional[torch.Tensor] = None,
-         labels: Optional[torch.Tensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-         **deprecated_arguments,
-     ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
-         """Replacement forward method for BloomCausalLM."""
-         if deprecated_arguments.pop("position_ids", False) is not False:
-             warnings.warn(
-                 "`position_ids` have no functionality in BLOOM and will be removed " + "in v5.0.0. You can safely ignore passing `position_ids`.", FutureWarning
-             )
-         if len(deprecated_arguments) > 0:
-             raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-         transformer_outputs = self.transformer(
-             input_ids,
-             past_key_values=past_key_values,
-             attention_mask=attention_mask,
-             bidirectional_mask=bidirectional_mask,
-             head_mask=head_mask,
-             inputs_embeds=inputs_embeds,
-             use_cache=use_cache,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-         hidden_states = transformer_outputs[0]
-         lm_logits = self.lm_head(hidden_states)
-         loss = None
-         if labels is not None:
-             shift_logits = lm_logits[..., :-1, :].contiguous()
-             shift_labels = labels[..., 1:].contiguous()
-             (batch_size, seq_length, vocab_size) = shift_logits.shape
-             loss_fct = CrossEntropyLoss()
-             loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
-         if not return_dict:
-             output = (lm_logits,) + transformer_outputs[1:]
-             return (loss,) + output if loss is not None else output
-         return CausalLMOutputWithCrossAttentions(
-             loss=loss,
-             logits=lm_logits,
-             past_key_values=transformer_outputs.past_key_values,
-             hidden_states=transformer_outputs.hidden_states,
-             attentions=transformer_outputs.attentions,
-         )
-
-     def prepare_inputs_for_generation(
-         self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
-     ) -> dict:
-         if past:
-             input_ids = input_ids[:, -1].unsqueeze(-1)
-             bidirectional_mask = None
-             if past[0][0].shape[0] == input_ids.shape[0]:
-                 past = self._convert_to_bloom_cache(past)
-         else:
-             bidirectional_mask = torch.ones_like(input_ids)
-         return {"input_ids": input_ids, "past_key_values": past, "use_cache": True, "attention_mask": attention_mask, "bidirectional_mask": bidirectional_mask}
-
-     setattr(model, "forward", MethodType(forward, model))
-     setattr(model, "prepare_inputs_for_generation", MethodType(prepare_inputs_for_generation, model))
-     setattr(model, "_prefix_lm_converted", True)
-     return model
-
-
- def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
-     """Converts an OPT Causal LM to a Prefix LM.
-
-     Supported HuggingFace model classes:
-         - `OPTForCausalLM`
-
-     See `convert_hf_causal_lm_to_prefix_lm` for more details.
-     """
-     if hasattr(model, "_prefix_lm_converted"):
-         return model
-     assert isinstance(model, OPTForCausalLM)
-     assert model.config.add_cross_attention == False, "Only supports OPT decoder-only models"
-     setattr(model, "_original_forward", getattr(model, "forward"))
-     setattr(model, "_original_generate", getattr(model, "generate"))
-     model.model.decoder.bidirectional_mask = None
-
-     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-         combined_attention_mask = None
-         if input_shape[-1] > 1:
-             if self.bidirectional_mask == "g":
-                 (bsz, src_length) = input_shape
-                 combined_attention_mask = torch.zeros(
-                     (bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device
-                 )
-             else:
-                 combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(
-                     inputs_embeds.device
-                 )
-                 if self.bidirectional_mask is not None:
-                     assert attention_mask.shape == self.bidirectional_mask.shape
-                     expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                         inputs_embeds.device
-                     )
-                     combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask)
-         if attention_mask is not None:
-             expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
-             combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-         return combined_attention_mask
-
-     setattr(model.model.decoder, "_prepare_decoder_attention_mask", MethodType(_prepare_decoder_attention_mask, model.model.decoder))
-
-     def forward(
-         self: OPTForCausalLM,
-         input_ids: Optional[torch.LongTensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         bidirectional_mask: Optional[torch.ByteTensor] = None,
-         head_mask: Optional[torch.Tensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ):
-         def call_og_forward():
-             return self._original_forward(
-                 input_ids=input_ids,
-                 attention_mask=attention_mask,
-                 head_mask=head_mask,
-                 past_key_values=past_key_values,
-                 inputs_embeds=inputs_embeds,
-                 labels=labels,
-                 use_cache=use_cache,
-                 output_attentions=output_attentions,
-                 output_hidden_states=output_hidden_states,
-                 return_dict=return_dict,
-             )
-
-         if bidirectional_mask is None:
-             return call_og_forward()
-         self.model.decoder.bidirectional_mask = bidirectional_mask
-         try:
-             outputs = call_og_forward()
-         except:
-             self.model.decoder.bidirectional_mask = None
-             raise
-         self.model.decoder.bidirectional_mask = None
-         return outputs
-
-     def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]):
-         """Wraps original generate to enable PrefixLM-style attention."""
-         self.model.decoder.bidirectional_mask = "g"
-         try:
-             output = self._original_generate(*args, **kwargs)
-         except:
-             self.model.decoder.bidirectional_mask = None
-             raise
-         self.model.decoder.bidirectional_mask = None
-         return output
-
-     setattr(model, "forward", MethodType(forward, model))
-     setattr(model, "generate", MethodType(generate, model))
-     setattr(model, "_prefix_lm_converted", True)
-     return model
-
-
- _SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
- CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]
-
-
- def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
-     """Converts a HuggingFace Causal LM to a Prefix LM.
-
-     Supported HuggingFace model classes:
-         - `GPT2LMHeadModel`
-         - `GPTNeoForCausalLM`
-         - `GPTNeoXForCausalLM`
-         - `GPTJForCausalLM`
-         - `BloomForCausalLM`
-         - `OPTForCausalLM`
-
-     Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
-     `generate` method and/or select underlying methods depending on the model class.
-
-     These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".
-
-     Notes on training:
-         To actually train the converted model as a Prefix LM, training batches will need to indicate
-         the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.
-
-         **This is not a standard input and requires custom layers either within or after your dataloader.**
-
-         In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
-         such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
-         That is, the prefix portion of the sequence should not generate any loss. Loss should only be
-         generated by the target portion of the sequence.
-
-     Notes on `GPTNeoForCausalLM`:
-         To simplify the implementation, "global" and "local" attention layers are handled differently.
-         For "global" layers, we handle conversion as described above. For "local" layers, which use a
-         causal attention mask within a restricted local window, we do not alter the masking.
-
-     Notes on `forward` method conversion:
-         After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
-         which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
-         belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
-         0 indicates token positions belonging to the target.
-
-         The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
-         causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
-         the causal masks before returning the result.
-
-     Notes on `generate` method conversion:
-         After conversion, the `generate` method will have the same signature but will internally
-         convert all causal masks to be purely bidirectional, call the original `generate` method, and
-         (where appropriate) reset the causal masks before returning the result.
-
-         This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
-         "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
-         each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
-         another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
-         previously-generated tokens (also as expected in a Prefix LM).
-
-     To preserve the API, the original methods are renamed to `_original_forward` and
-     `_original_generate`, and replaced with new `forward` and `generate` methods that wrap
-     them, respectively; implementation details vary by model class.
-     """
-     if isinstance(model, _SUPPORTED_GPT_MODELS):
-         return _convert_gpt_causal_lm_to_prefix_lm(model)
-     elif isinstance(model, BloomForCausalLM):
-         return _convert_bloom_causal_lm_to_prefix_lm(model)
-     elif isinstance(model, OPTForCausalLM):
-         return _convert_opt_causal_lm_to_prefix_lm(model)
-     else:
-         raise TypeError("Cannot convert model to Prefix LM. " + "Model does not belong to the set of supported HF models:" + f"\n{_SUPPORTED_HF_MODELS}")
-
-
- def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
-     """Attempts to add bidirectional_mask to batch if missing.
-
-     Raises:
-         KeyError if bidirectional_mask is missing and can't be inferred.
-     """
-     if "bidirectional_mask" not in batch:
-         if batch.get("mode", None) == "icl_task":
-             batch["bidirectional_mask"] = batch["attention_mask"].clone()
-             for i, continuation_indices in enumerate(batch["continuation_indices"]):
-                 batch["bidirectional_mask"][i, continuation_indices] = 0
-         elif "labels" in batch and "attention_mask" in batch:
-             batch["bidirectional_mask"] = torch.logical_and(torch.eq(batch["attention_mask"], 1), torch.eq(batch["labels"], -100)).type_as(
-                 batch["attention_mask"]
-             )
-         else:
-             raise KeyError("No bidirectional_mask in batch and not sure how to construct one.")
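
A hedged sketch of how the converter and the mask conventions documented above fit together at training time; the checkpoint name, prefix length, and batch construction below are illustrative only, not taken from this repo's training code:

    import torch
    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    model = convert_hf_causal_lm_to_prefix_lm(GPT2LMHeadModel.from_pretrained("gpt2"))
    tok = GPT2Tokenizer.from_pretrained("gpt2")
    batch = tok(["Translate to French: Hello world"], return_tensors="pt")
    bidirectional_mask = torch.zeros_like(batch["input_ids"])
    bidirectional_mask[:, :5] = 1  # first 5 tokens form the bidirectionally visible prefix
    labels = batch["input_ids"].clone()
    labels[bidirectional_mask == 1] = -100  # no loss on the prefix, per the docstring above
    out = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        bidirectional_mask=bidirectional_mask,
        labels=labels,
    )
    out.loss.backward()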
mllm/flamingo/mpt/meta_init_context.py DELETED
@@ -1,98 +0,0 @@
- from contextlib import contextmanager
- import torch
- import torch.nn as nn
-
-
- @contextmanager
- def init_empty_weights(include_buffers: bool = False):
-     """Meta initialization context manager.
-
-     A context manager under which models are initialized with all parameters
-     on the meta device, therefore creating an empty model. Useful when just
-     initializing the model would blow the available RAM.
-
-     Args:
-         include_buffers (`bool`, *optional*, defaults to `False`): Whether or
-             not to also put all buffers on the meta device while initializing.
-
-     Example:
-     ```python
-     import torch.nn as nn
-
-     # Initialize a model with 100 billion parameters in no time and without using any RAM.
-     with init_empty_weights():
-         tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
-     ```
-
-     <Tip warning={true}>
-
-     Any model created under this context manager has no weights. As such you can't do something like
-     `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
-
-     </Tip>
-     """
-     with init_on_device(torch.device("meta"), include_buffers=include_buffers) as f:
-         yield f
-
-
- @contextmanager
- def init_on_device(device: torch.device, include_buffers: bool = False):
-     """Device initialization context manager.
-
-     A context manager under which models are initialized with all parameters
-     on the specified device.
-
-     Args:
-         device (`torch.device`): Device to initialize all parameters on.
-         include_buffers (`bool`, *optional*, defaults to `False`): Whether or
-             not to also put all buffers on the specified device while initializing.
-
-     Example:
-     ```python
-     import torch.nn as nn
-
-     with init_on_device(device=torch.device("cuda")):
-         tst = nn.Linear(100, 100)  # on `cuda` device
-     ```
-     """
-     old_register_parameter = nn.Module.register_parameter
-     if include_buffers:
-         old_register_buffer = nn.Module.register_buffer
-
-     def register_empty_parameter(module, name, param):
-         old_register_parameter(module, name, param)
-         if param is not None:
-             param_cls = type(module._parameters[name])
-             kwargs = module._parameters[name].__dict__
-             module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
-
-     def register_empty_buffer(module, name, buffer):
-         old_register_buffer(module, name, buffer)
-         if buffer is not None:
-             module._buffers[name] = module._buffers[name].to(device)
-
-     if include_buffers:
-         tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ["empty", "zeros", "ones", "full"]}
-     else:
-         tensor_constructors_to_patch = {}
-
-     def patch_tensor_constructor(fn):
-         def wrapper(*args, **kwargs):
-             kwargs["device"] = device
-             return fn(*args, **kwargs)
-
-         return wrapper
-
-     try:
-         nn.Module.register_parameter = register_empty_parameter
-         if include_buffers:
-             nn.Module.register_buffer = register_empty_buffer
-         for torch_function_name in tensor_constructors_to_patch.keys():
-             setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
-         yield
-     finally:
-         nn.Module.register_parameter = old_register_parameter
-         if include_buffers:
-             nn.Module.register_buffer = old_register_buffer
-         for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
-             setattr(torch, torch_function_name, old_torch_function)
mllm/flamingo/mpt/modeling_mpt.py DELETED
@@ -1,496 +0,0 @@
- """A simple, flexible implementation of a GPT model.
-
- Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
- """
- import math
- import warnings
- from typing import List, Optional, Tuple, Union
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-
- from .attention import attn_bias_shape, build_attn_bias
- from .blocks import MPTBlock
- from .configuration_mpt import MPTConfig
- from .custom_embedding import SharedEmbedding
- from .norm import NORM_CLASS_REGISTRY
- from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
-
- import torch.distributed as dist
-
- try:
-     from .flash_attn_triton import flash_attn_func
- except:
-     pass
- Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
-
-
- class MPTPreTrainedModel(PreTrainedModel):
-     config_class = MPTConfig
-     base_model_prefix = "model"
-     _no_split_modules = ["MPTBlock"]
-
-
- class MPTModel(MPTPreTrainedModel):
-     def __init__(self, config: MPTConfig):
-         config._validate_config()
-         super().__init__(config)
-         self.attn_impl = config.attn_config["attn_impl"]
-         self.prefix_lm = config.attn_config["prefix_lm"]
-         self.attn_uses_sequence_id = config.attn_config["attn_uses_sequence_id"]
-         self.alibi = config.attn_config["alibi"]
-         self.alibi_bias_max = config.attn_config["alibi_bias_max"]
-         if config.init_device == "mixed":
-             if dist.get_local_rank() == 0:
-                 config.init_device = "cpu"
-             else:
-                 config.init_device = "meta"
-         if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
-             norm_options = " | ".join(NORM_CLASS_REGISTRY.keys())
-             raise NotImplementedError(f"Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).")
-         norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
-         self.embedding_fraction = config.embedding_fraction
-         self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
-         if not self.alibi:
-             self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
-         self.emb_drop = nn.Dropout(config.emb_pdrop)
-         self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
-         self.norm_f = norm_class(config.d_model, device=config.init_device)
-         if config.init_device != "meta":
-             print(
-                 f'You are using config.init_device={config.init_device!r}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
-             )
-             self.apply(self.param_init_fn)
-         self.is_causal = not self.prefix_lm
-         self._attn_bias_initialized = False
-         self.attn_bias = None
-         self.attn_bias_shape = attn_bias_shape(
-             self.attn_impl,
-             config.n_heads,
-             config.max_seq_len,
-             self.alibi,
-             prefix_lm=self.prefix_lm,
-             causal=self.is_causal,
-             use_sequence_id=self.attn_uses_sequence_id,
-         )
-         if config.no_bias:
-             for module in self.modules():
-                 if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
-                     if config.verbose:
-                         warnings.warn(f"Removing bias ({module.bias}) from {module}.")
-                     module.register_parameter("bias", None)
-         if config.verbose and config.verbose > 2:
-             print(self)
-         if "verbose" not in self.config.init_config:
-             self.config.init_config["verbose"] = self.config.verbose
-         if self.config.init_config["verbose"] > 1:
-             init_fn_name = self.config.init_config["name"]
-             warnings.warn(f"Using {init_fn_name} initialization.")
-
-     def get_input_embeddings(self):
-         return self.wte
-
-     def set_input_embeddings(self, value):
-         self.wte = value
-
-     @torch.no_grad()
-     def _attn_bias(
-         self,
-         device,
-         dtype,
-         attention_mask: Optional[torch.ByteTensor] = None,
-         prefix_mask: Optional[torch.ByteTensor] = None,
-         sequence_id: Optional[torch.LongTensor] = None,
-     ):
-         if not self._attn_bias_initialized:
-             if self.attn_bias_shape:
-                 self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
-                 self.attn_bias = build_attn_bias(
-                     self.attn_impl,
-                     self.attn_bias,
-                     self.config.n_heads,
-                     self.config.max_seq_len,
-                     causal=self.is_causal,
-                     alibi=self.alibi,
-                     alibi_bias_max=self.alibi_bias_max,
-                 )
-             self._attn_bias_initialized = True
-         if self.attn_impl == "flash":
-             return (self.attn_bias, attention_mask)
-         if self.attn_bias is not None:
-             self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
-         attn_bias = self.attn_bias
-         if self.prefix_lm:
-             assert isinstance(attn_bias, torch.Tensor)
-             assert isinstance(prefix_mask, torch.Tensor)
-             attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
-         if self.attn_uses_sequence_id and sequence_id is not None:
-             assert isinstance(attn_bias, torch.Tensor)
-             attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
-         if attention_mask is not None:
-             s_k = attention_mask.shape[-1]
-             if attn_bias is None:
-                 attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
-             else:
-                 _s_k = max(0, attn_bias.size(-1) - s_k)
-                 attn_bias = attn_bias[:, :, :, _s_k:]
-             if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
-                 raise ValueError(f"attention_mask shape={attention_mask.shape} " + f"and prefix_mask shape={prefix_mask.shape} are not equal.")
-             min_val = torch.finfo(attn_bias.dtype).min
-             attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
-         return (attn_bias, None)
-
-     def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
-         (s_k, s_q) = attn_bias.shape[-2:]
-         if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
-             raise ValueError(
-                 "attn_bias does not match the expected shape. "
-                 + f"The last two dimensions should both be {self.config.max_seq_len} "
-                 + f"but are {s_k} and {s_q}."
-             )
-         seq_len = prefix_mask.shape[-1]
-         if seq_len > self.config.max_seq_len:
-             raise ValueError(f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}")
-         attn_bias = attn_bias[..., :seq_len, :seq_len]
-         causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
-         prefix = prefix_mask.view(-1, 1, 1, seq_len)
-         cannot_attend = ~torch.logical_or(causal, prefix.bool())
-         min_val = torch.finfo(attn_bias.dtype).min
-         attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
-         return attn_bias
-
-     def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
-         seq_len = sequence_id.shape[-1]
-         if seq_len > self.config.max_seq_len:
-             raise ValueError(f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}")
-         attn_bias = attn_bias[..., :seq_len, :seq_len]
-         cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
-         min_val = torch.finfo(attn_bias.dtype).min
-         attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
-         return attn_bias
-
-     def forward(
-         self,
-         input_ids: torch.LongTensor,
-         past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
-         attention_mask: Optional[torch.ByteTensor] = None,
-         prefix_mask: Optional[torch.ByteTensor] = None,
-         sequence_id: Optional[torch.LongTensor] = None,
-         return_dict: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         use_cache: Optional[bool] = None,
-         inputs_embeds: Optional[torch.Tensor] = None,
-     ):
-         return_dict = return_dict if return_dict is not None else self.config.return_dict
-         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-         if attention_mask is not None:
-             attention_mask = attention_mask.bool()
-
-         if prefix_mask is not None:
-             prefix_mask = prefix_mask.bool()
-
-         # These args are passed in by keyword in huggingface's generate function
-         # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206
-         # but have not yet been fully implemented in MPTModel
-         if not return_dict:
-             raise NotImplementedError("return_dict False is not implemented yet for MPT")
-         if output_attentions:
-             if self.attn_impl != "torch":
-                 raise NotImplementedError("output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.")
-
-         if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
-             raise NotImplementedError("MPT does not support training with left padding.")
-
-         if self.prefix_lm and prefix_mask is None:
-             raise ValueError("prefix_mask is a required argument when MPT is configured with prefix_lm=True.")
-
-         # Raise a not implemented error if inputs_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT)
-         if inputs_embeds is not None:
-             raise NotImplementedError("inputs_embeds is not implemented for MPT.")
-
-         if self.training:
-             if self.attn_uses_sequence_id and sequence_id is None:
-                 raise ValueError(
-                     "sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True " + "and the model is in train mode."
-                 )
-             elif (self.attn_uses_sequence_id is False) and (sequence_id is not None):
-                 warnings.warn(
-                     "MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. "
-                     + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True."
-                 )
-
-         S = input_ids.size(1)
-
-         assert S <= self.config.max_seq_len, f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}"
-
-         tok_emb = self.wte(input_ids)  # type: ignore
-         if self.alibi:
-             x = tok_emb
-         else:
-             past_position = 0
-             if past_key_values is not None:
-                 if len(past_key_values) != self.config.n_layers:
-                     raise ValueError(
-                         f"past_key_values must provide a past_key_value for each attention "
-                         + f"layer in the network ({len(past_key_values)=}; {self.config.n_layers=})."
-                     )
-                 # For attn_impl: triton and flash the past key tensor spec is (batch, seq, dim).
-                 # For attn_impl: torch the past key tensor spec is (batch, heads, head_dim, seq).
-                 # Here we shift position embedding using the `seq` dim of the past key
-                 past_position = past_key_values[0][0].size(1)
-                 if self.attn_impl == "torch":
-                     past_position = past_key_values[0][0].size(3)
-
-             if S + past_position > self.config.max_seq_len:
-                 raise ValueError(
-                     f"Cannot forward input with past sequence length {past_position} and current sequence length "
-                     f"{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}."
-                 )
-             pos = torch.arange(
-                 past_position,
-                 S + past_position,
-                 dtype=torch.long,
-                 device=input_ids.device,
-             ).unsqueeze(0)
-             if attention_mask is not None:
-                 # adjust the position indices to account for padding tokens
-                 pos = torch.clamp(
-                     pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:],
-                     min=0,
-                 )
-
-             pos_emb = self.wpe(pos)  # type: ignore
-             x = tok_emb + pos_emb
-
-         if self.embedding_fraction == 1:
-             x = self.emb_drop(x)  # type: ignore
-         else:
-             # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414
-             x_shrunk = (x * self.embedding_fraction) + (x.detach() * (1 - self.embedding_fraction))
-             assert isinstance(self.emb_drop, nn.Module)  # pyright
-             x = self.emb_drop(x_shrunk)
-
-         attn_bias, attention_mask = self._attn_bias(
-             device=x.device,
-             dtype=torch.float32,
-             attention_mask=attention_mask,
-             prefix_mask=prefix_mask,
-             sequence_id=sequence_id,
-         )
-
-         # initialize the past key values cache if it should be used
-         if use_cache and past_key_values is None:
-             past_key_values = [() for _ in range(self.config.n_layers)]  # type: ignore
-
-         all_hidden_states = () if output_hidden_states else None
-         all_self_attns = () if output_attentions else None
-         for b_idx, block in enumerate(self.blocks):  # type: ignore
-             if output_hidden_states:
-                 assert all_hidden_states is not None  # pyright
-                 all_hidden_states = all_hidden_states + (x,)
-             past_key_value = past_key_values[b_idx] if past_key_values is not None else None
-             x, attn_weights, past_key_value = block(
-                 x,
-                 past_key_value=past_key_value,
-                 attn_bias=attn_bias,
-                 attention_mask=attention_mask,
-                 is_causal=self.is_causal,
-             )
-             if past_key_values is not None:
-                 past_key_values[b_idx] = past_key_value
-
-             if output_attentions:
-                 assert all_self_attns is not None  # pyright
-                 all_self_attns = all_self_attns + (attn_weights,)
-
-         x = self.norm_f(x)  # type: ignore
-
-         # add hidden states from the last decoder layer
-         if output_hidden_states:
-             assert all_hidden_states is not None  # pyright
-             all_hidden_states = all_hidden_states + (x,)
-
-         return BaseModelOutputWithPast(
-             last_hidden_state=x,
-             past_key_values=past_key_values,
-             hidden_states=all_hidden_states,
-             attentions=all_self_attns,
-         )
-
-     # Param Initialization, needed for device='meta' fast initialization
-     def param_init_fn(self, module):
-         init_fn_name = self.config.init_config["name"]
-         MODEL_INIT_REGISTRY[init_fn_name](
-             module=module,
-             n_layers=self.config.n_layers,
-             d_model=self.config.d_model,
-             **self.config.init_config,
-         )
-
-     def fsdp_wrap_fn(self, module):
-         return isinstance(module, MPTBlock)
-
-     def activation_checkpointing_fn(self, module):
-         return isinstance(module, MPTBlock)
-
-
- class MPTForCausalLM(MPTPreTrainedModel):
-     def __init__(self, config: MPTConfig):
-         super().__init__(config)
-         if not config.tie_word_embeddings:
-             raise ValueError("MPTForCausalLM only supports tied word embeddings")
-         self.transformer = MPTModel(config)
-         for child in self.transformer.children():
-             if isinstance(child, torch.nn.ModuleList):
-                 continue
-             if isinstance(child, torch.nn.Module):
-                 child._fsdp_wrap = True
-         self.logit_scale = None
-         if config.logit_scale is not None:
-             logit_scale = config.logit_scale
-             if isinstance(logit_scale, str):
-                 if logit_scale == "inv_sqrt_d_model":
-                     logit_scale = 1 / math.sqrt(config.d_model)
-                 else:
-                     raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
-             self.logit_scale = logit_scale
-
-     def get_input_embeddings(self):
-         return self.transformer.wte
-
-     def set_input_embeddings(self, value):
-         # self.transformer.wte = value
-         pseudo_wte = SharedEmbedding(value.weight.shape[0], value.weight.shape[1], device=self.transformer.wte.weight.device)
-         pseudo_wte.weight = value.weight
-         self.transformer.wte = pseudo_wte
-
-     def get_output_embeddings(self):
-         return self.transformer.wte
-
-     def set_output_embeddings(self, new_embeddings):
-         # self.transformer.wte = new_embeddings
-         pseudo_wte = SharedEmbedding(new_embeddings.weight.shape[0], new_embeddings.weight.shape[1], device=self.transformer.wte.weight.device)
-         pseudo_wte.weight = new_embeddings.weight
-         self.transformer.wte = pseudo_wte
-
-     def set_decoder(self, decoder):
-         self.transformer = decoder
-
-     def get_decoder(self):
-         return self.transformer
-
-     def forward(
-         self,
-         input_ids: torch.LongTensor,
-         past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
-         attention_mask: Optional[torch.ByteTensor] = None,
-         prefix_mask: Optional[torch.ByteTensor] = None,
-         sequence_id: Optional[torch.LongTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         return_dict: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         use_cache: Optional[bool] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-     ):
-         return_dict = return_dict if return_dict is not None else self.config.return_dict
-         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-         # if inputs_embeds is not None, raise a not implemented error
-         if inputs_embeds is not None:
-             raise NotImplementedError("inputs_embeds has to be None (for hf/peft support).")
-         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-         outputs = self.transformer(
-             input_ids=input_ids,
-             past_key_values=past_key_values,
-             attention_mask=attention_mask,
-             prefix_mask=prefix_mask,
-             sequence_id=sequence_id,
-             return_dict=return_dict,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             use_cache=use_cache,
-         )
-
-         # move outputs to same device as weights for token embedding
-         # needed to support HF `device_map`
-         logits = self.transformer.wte(
-             input=outputs.last_hidden_state.to(self.transformer.wte.weight.device),
-             unembed=True,
-         )
-
-         if self.logit_scale is not None:
-             if self.logit_scale == 0:
-                 warnings.warn(f"Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.")
-             logits *= self.logit_scale
-
-         loss = None
-         if labels is not None:
-             _labels = torch.roll(labels, shifts=-1)
-             _labels[:, -1] = -100
-             loss = F.cross_entropy(
-                 logits.view(-1, logits.size(-1)),
-                 _labels.to(logits.device).view(-1),
-             )
-
-         return CausalLMOutputWithPast(
-             loss=loss,
-             logits=logits,
-             past_key_values=outputs.past_key_values,
-             hidden_states=outputs.hidden_states,
-             attentions=outputs.attentions,
-         )
-
-     def param_init_fn(self, module):
-         init_fn_name = self.config.init_config["name"]
-         MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
-
-     def fsdp_wrap_fn(self, module):
-         return isinstance(module, MPTBlock)
-
-     def activation_checkpointing_fn(self, module):
-         return isinstance(module, MPTBlock)
-
-     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, **kwargs):
-         if inputs_embeds is not None:
-             raise NotImplementedError("inputs_embeds is not implemented for MPT yet")
-         attention_mask = attention_mask.bool()
-         if attention_mask[:, -1].sum() != attention_mask.shape[0]:
-             raise NotImplementedError("MPT does not support generation with right padding.")
-         if self.transformer.attn_uses_sequence_id and self.training:
-             sequence_id = torch.zeros_like(input_ids[:1])
-         else:
-             sequence_id = None
-         if past_key_values is not None:
-             input_ids = input_ids[:, -1].unsqueeze(-1)
-         if self.transformer.prefix_lm:
-             prefix_mask = torch.ones_like(attention_mask)
-             if kwargs.get("use_cache") == False:
-                 raise NotImplementedError("MPT with prefix_lm=True does not support use_cache=False.")
-         else:
-             prefix_mask = None
-         return {
-             "input_ids": input_ids,
-             "attention_mask": attention_mask,
-             "prefix_mask": prefix_mask,
-             "sequence_id": sequence_id,
-             "past_key_values": past_key_values,
-             "use_cache": kwargs.get("use_cache", True),
-         }
-
-     @staticmethod
-     def _reorder_cache(past_key_values, beam_idx):
-         """Used by HuggingFace generate when using beam search with kv-caching.
-
-         See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
-         for an example in transformers.
-         """
-         reordered_past = []
-         for layer_past in past_key_values:
-             reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))]
-         return reordered_past
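
A standalone illustration of the label handling in `MPTForCausalLM.forward`: `torch.roll(labels, shifts=-1)` plus masking the wrapped-around last position with -100 is equivalent to the usual `logits[:, :-1]` vs `labels[:, 1:]` shift. The toy sizes below are arbitrary:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 5, 11)        # (batch, seq, vocab)
    labels = torch.randint(0, 11, (2, 5))
    _labels = torch.roll(labels, shifts=-1)
    _labels[:, -1] = -100                 # wrapped position must not contribute loss
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.view(-1))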
mllm/flamingo/mpt/norm.py DELETED
@@ -1,60 +0,0 @@
- import torch
-
-
- def _cast_if_autocast_enabled(tensor):
-     if torch.is_autocast_enabled():
-         if tensor.device.type == "cuda":
-             dtype = torch.get_autocast_gpu_dtype()
-         elif tensor.device.type == "cpu":
-             dtype = torch.get_autocast_cpu_dtype()
-         else:
-             raise NotImplementedError()
-         return tensor.to(dtype=dtype)
-     return tensor
-
-
- class LPLayerNorm(torch.nn.LayerNorm):
-     def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
-         super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
-
-     def forward(self, x):
-         module_device = x.device
-         downcast_x = _cast_if_autocast_enabled(x)
-         downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
-         downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
-         with torch.autocast(enabled=False, device_type=module_device.type):
-             return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
-
-
- def rms_norm(x, weight=None, eps=1e-05):
-     output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-     if weight is not None:
-         return output * weight
-     return output
-
-
- class RMSNorm(torch.nn.Module):
-     def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
-         super().__init__()
-         self.eps = eps
-         if weight:
-             self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
-         else:
-             self.register_parameter("weight", None)
-
-     def forward(self, x):
-         return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
-
-
- class LPRMSNorm(RMSNorm):
-     def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
-         super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)
-
-     def forward(self, x):
-         downcast_x = _cast_if_autocast_enabled(x)
-         downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
-         with torch.autocast(enabled=False, device_type=x.device.type):
-             return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
-
-
- NORM_CLASS_REGISTRY = {"layernorm": torch.nn.LayerNorm, "low_precision_layernorm": LPLayerNorm, "rmsnorm": RMSNorm, "low_precision_rmsnorm": LPRMSNorm}
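The registry on the last line is how the rest of the package selects a norm implementation by config string; the lookup elsewhere in the modeling code (not shown in this hunk) is presumably along the lines of NORM_CLASS_REGISTRY[config.norm_type.lower()]. The LP* variants run the norm in the autocast dtype while disabling autocast inside the call, which avoids an fp32 upcast of the normalization under mixed precision. As a quick numeric sanity check on rms_norm, the x * torch.rsqrt(...) form is exactly division by the root-mean-square and leaves each row with approximately unit RMS:

import torch

x = torch.randn(2, 8)
eps = 1e-05
out = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
ref = x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
assert torch.allclose(out, ref)        # multiplying by rsqrt == dividing by sqrt
print(out.pow(2).mean(-1))             # ~1.0 per row after normalization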
mllm/flamingo/mpt/param_init_fns.py DELETED
@@ -1,369 +0,0 @@
- import math
- import warnings
- from collections.abc import Sequence
- from functools import partial
- from typing import Optional, Tuple, Union
- import torch
- from torch import nn
- from .norm import NORM_CLASS_REGISTRY
-
-
- def torch_default_param_init_fn_(module: nn.Module, verbose: int = 0, **kwargs):
-     del kwargs
-     if verbose > 1:
-         warnings.warn("Initializing network using module's reset_parameters attribute")
-     if hasattr(module, "reset_parameters"):
-         module.reset_parameters()
-
-
- def fused_init_helper_(module: nn.Module, init_fn_):
-     _fused = getattr(module, "_fused", None)
-     if _fused is None:
-         raise RuntimeError("Internal logic error")
-     (dim, splits) = _fused
-     splits = (0, *splits, module.weight.size(dim))
-     for s, e in zip(splits[:-1], splits[1:]):
-         slice_indices = [slice(None)] * module.weight.ndim
-         slice_indices[dim] = slice(s, e)
-         init_fn_(module.weight[slice_indices])
-
-
- def generic_param_init_fn_(
-     module: nn.Module,
-     init_fn_,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     if verbose > 1:
-         warnings.warn("If model has bias parameters they are initialized to 0.")
-     if init_div_is_residual is False:
-         div_is_residual = 1.0
-     elif init_div_is_residual is True:
-         div_is_residual = math.sqrt(2 * n_layers)
-     elif isinstance(init_div_is_residual, (float, int)):
-         div_is_residual = init_div_is_residual
-     elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
-         div_is_residual = float(init_div_is_residual)
-     else:
-         raise ValueError(f"Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}")
-     if init_div_is_residual is not False:
-         if verbose > 1:
-             warnings.warn(
-                 f"Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. "
-                 "Set `init_div_is_residual: false` in init config to disable this."
-             )
-     if isinstance(module, nn.Linear):
-         if hasattr(module, "_fused"):
-             fused_init_helper_(module, init_fn_)
-         else:
-             init_fn_(module.weight)
-         if module.bias is not None:
-             torch.nn.init.zeros_(module.bias)
-         if init_div_is_residual is not False and getattr(module, "_is_residual", False):
-             with torch.no_grad():
-                 module.weight.div_(div_is_residual)
-     elif isinstance(module, nn.Embedding):
-         if emb_init_std is not None:
-             std = emb_init_std
-             if std == 0:
-                 warnings.warn("Embedding layer initialized to 0.")
-             emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
-             if verbose > 1:
-                 warnings.warn(f"Embedding layer initialized using normal distribution with mean=0 and std={std!r}.")
-         elif emb_init_uniform_lim is not None:
-             lim = emb_init_uniform_lim
-             if isinstance(lim, Sequence):
-                 if len(lim) > 2:
-                     raise ValueError(f"Uniform init requires a min and a max limit. User input: {lim}.")
-                 if lim[0] == lim[1]:
-                     warnings.warn(f"Embedding layer initialized to {lim[0]}.")
-             else:
-                 if lim == 0:
-                     warnings.warn("Embedding layer initialized to 0.")
-                 lim = [-lim, lim]
-             (a, b) = lim
-             emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
-             if verbose > 1:
-                 warnings.warn(f"Embedding layer initialized using uniform distribution in range {lim}.")
-         else:
-             emb_init_fn_ = init_fn_
-         emb_init_fn_(module.weight)
-     elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
-         if verbose > 1:
-             warnings.warn("Norm weights are set to 1. If norm layer has a bias it is initialized to 0.")
-         if hasattr(module, "weight") and module.weight is not None:
-             torch.nn.init.ones_(module.weight)
-         if hasattr(module, "bias") and module.bias is not None:
-             torch.nn.init.zeros_(module.bias)
-     elif isinstance(module, nn.MultiheadAttention):
-         if module._qkv_same_embed_dim:
-             assert module.in_proj_weight is not None
-             assert module.q_proj_weight is None and module.k_proj_weight is None and module.v_proj_weight is None
-             assert d_model is not None
-             _d = d_model
-             splits = (0, _d, 2 * _d, 3 * _d)
-             for s, e in zip(splits[:-1], splits[1:]):
-                 init_fn_(module.in_proj_weight[s:e])
-         else:
-             assert module.q_proj_weight is not None and module.k_proj_weight is not None and module.v_proj_weight is not None
-             assert module.in_proj_weight is None
-             init_fn_(module.q_proj_weight)
-             init_fn_(module.k_proj_weight)
-             init_fn_(module.v_proj_weight)
-         if module.in_proj_bias is not None:
-             torch.nn.init.zeros_(module.in_proj_bias)
-         if module.bias_k is not None:
-             torch.nn.init.zeros_(module.bias_k)
-         if module.bias_v is not None:
-             torch.nn.init.zeros_(module.bias_v)
-         init_fn_(module.out_proj.weight)
-         if init_div_is_residual is not False and getattr(module.out_proj, "_is_residual", False):
-             with torch.no_grad():
-                 module.out_proj.weight.div_(div_is_residual)
-         if module.out_proj.bias is not None:
-             torch.nn.init.zeros_(module.out_proj.bias)
-     else:
-         for _ in module.parameters(recurse=False):
-             raise NotImplementedError(f"{module.__class__.__name__} parameters are not initialized by param_init_fn.")
-
-
- def _normal_init_(std, mean=0.0):
-     return partial(torch.nn.init.normal_, mean=mean, std=std)
-
-
- def _normal_param_init_fn_(
-     module: nn.Module,
-     std: float,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     init_fn_ = _normal_init_(std=std)
-     if verbose > 1:
-         warnings.warn(f"Using torch.nn.init.normal_ init fn mean=0.0, std={std}")
-     generic_param_init_fn_(
-         module=module,
-         init_fn_=init_fn_,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def baseline_param_init_fn_(
-     module: nn.Module,
-     init_std: float,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     if init_std is None:
-         raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
-     _normal_param_init_fn_(
-         module=module,
-         std=init_std,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def small_param_init_fn_(
-     module: nn.Module,
-     n_layers: int,
-     d_model: int,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     std = math.sqrt(2 / (5 * d_model))
-     _normal_param_init_fn_(
-         module=module,
-         std=std,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def neox_param_init_fn_(
-     module: nn.Module,
-     n_layers: int,
-     d_model: int,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     verbose: int = 0,
-     **kwargs,
- ):
-     """From section 2.3.1 of GPT-NeoX-20B:
-
-     An Open-Source Autoregressive Language Model, Black et al. (2022);
-     see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
-     and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
-     """
-     del kwargs
-     residual_div = n_layers / math.sqrt(10)
-     if verbose > 1:
-         warnings.warn(f"setting init_div_is_residual to {residual_div}")
-     small_param_init_fn_(
-         module=module,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=residual_div,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def kaiming_uniform_param_init_fn_(
-     module: nn.Module,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     init_gain: float = 0,
-     fan_mode: str = "fan_in",
-     init_nonlinearity: str = "leaky_relu",
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     if verbose > 1:
-         warnings.warn(f"Using nn.init.kaiming_uniform_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}")
-     kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
-     generic_param_init_fn_(
-         module=module,
-         init_fn_=kaiming_uniform_,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def kaiming_normal_param_init_fn_(
-     module: nn.Module,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     init_gain: float = 0,
-     fan_mode: str = "fan_in",
-     init_nonlinearity: str = "leaky_relu",
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     if verbose > 1:
-         warnings.warn(f"Using nn.init.kaiming_normal_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}")
-     kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
-     generic_param_init_fn_(
-         module=module,
-         init_fn_=kaiming_normal_,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def xavier_uniform_param_init_fn_(
-     module: nn.Module,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     init_gain: float = 0,
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
-     if verbose > 1:
-         warnings.warn(f"Using torch.nn.init.xavier_uniform_ init fn with parameters: gain={init_gain}")
-     generic_param_init_fn_(
-         module=module,
-         init_fn_=xavier_uniform_,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- def xavier_normal_param_init_fn_(
-     module: nn.Module,
-     n_layers: int,
-     d_model: Optional[int] = None,
-     init_div_is_residual: Union[int, float, str, bool] = True,
-     emb_init_std: Optional[float] = None,
-     emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
-     init_gain: float = 0,
-     verbose: int = 0,
-     **kwargs,
- ):
-     del kwargs
-     xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
-     if verbose > 1:
-         warnings.warn(f"Using torch.nn.init.xavier_normal_ init fn with parameters: gain={init_gain}")
-     generic_param_init_fn_(
-         module=module,
-         init_fn_=xavier_normal_,
-         d_model=d_model,
-         n_layers=n_layers,
-         init_div_is_residual=init_div_is_residual,
-         emb_init_std=emb_init_std,
-         emb_init_uniform_lim=emb_init_uniform_lim,
-         verbose=verbose,
-     )
-
-
- MODEL_INIT_REGISTRY = {
-     "default_": torch_default_param_init_fn_,
-     "baseline_": baseline_param_init_fn_,
-     "kaiming_uniform_": kaiming_uniform_param_init_fn_,
-     "kaiming_normal_": kaiming_normal_param_init_fn_,
-     "neox_init_": neox_param_init_fn_,
-     "small_init_": small_param_init_fn_,
-     "xavier_uniform_": xavier_uniform_param_init_fn_,
-     "xavier_normal_": xavier_normal_param_init_fn_,
- }
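MODEL_INIT_REGISTRY is consumed by param_init_fn in the modeling_mpt.py hunk earlier in this diff, which looks up init_config["name"] and calls the matching entry on every submodule. The same dispatch can be reproduced standalone with nn.Module.apply; in the sketch below the tiny model and dimensions are hypothetical, and small_init_ just re-derives the std = sqrt(2 / (5 * d_model)) rule from small_param_init_fn_ above for the Linear case only:

import math
from functools import partial

from torch import nn

d_model, n_layers = 16, 2
model = nn.Sequential(nn.Linear(d_model, d_model), nn.GELU(), nn.Linear(d_model, d_model))

def small_init_(module, n_layers, d_model, **kwargs):
    # Mirrors small_param_init_fn_ for nn.Linear; other module types are skipped.
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2 / (5 * d_model)))
        if module.bias is not None:
            nn.init.zeros_(module.bias)

# apply() visits every submodule, the same traversal param_init_fn relies on.
model.apply(partial(small_init_, n_layers=n_layers, d_model=d_model))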
mllm/flamingo/mpt_redpajama/__init__.py DELETED
File without changes
mllm/flamingo/mpt_redpajama/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (216 Bytes)