Delete modeling_mpt.py

modeling_mpt.py
DELETED  +0 -833

@@ -1,833 +0,0 @@
# Adapted from https://github.com/mosaicml/llm-foundry
# Classes changed: MPTModel, MPTForCausalLM
# SPDX-License-Identifier: Apache-2.0

"""A simple, flexible implementation of a GPT model.

Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""

import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.linalg import vector_norm
import faiss
from einops import rearrange
from composer.utils import dist
from omegaconf import DictConfig

from transformers import (PreTrainedModel, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)
from transformers.modeling_outputs import (BaseModelOutputWithPast,
                                           CausalLMOutputWithPast)
from llmfoundry.models.layers.custom_embedding import SharedEmbedding
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
from llmfoundry.models.utils.param_init_fns import MODEL_INIT_REGISTRY

from .configuration import ExtendedMPTConfig
from .attention import attn_bias_shape, build_attn_bias
from .blocks import MPTBlock
from .utils import instantiate_from_config

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

class MPTPreTrainedModel(PreTrainedModel):
    config_class = ExtendedMPTConfig
    base_model_prefix = 'model'
    _no_split_modules = ['MPTBlock']

class ExtendedMPTModel(MPTPreTrainedModel):

    def __init__(self, config: ExtendedMPTConfig):
        config._validate_config()
        super().__init__(config)

        self.attn_impl = config.attn_config['attn_impl']
        self.prefix_lm = config.attn_config['prefix_lm']
        self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
        self.alibi = config.attn_config['alibi']
        self.alibi_bias_max = config.attn_config['alibi_bias_max']

        self.mask_by_sim = config.attn_config['mask_by_sim']
        self.sim_threshold = config.attn_config['sim_threshold']
        self.topk = config.attn_config['topk']
        self.use_active_externalism = config.attn_config['use_active_externalism']

        self.use_active_externalism_by_layer = config.use_active_externalism_by_layer

        if config.init_device == 'mixed':
            if dist.get_local_rank() == 0:
                config.init_device = 'cpu'
            else:
                config.init_device = 'meta'

        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
            norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
            raise NotImplementedError(
                f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).'
            )
        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]

        # CogView (https://arxiv.org/abs/2105.13290) and GLM-130B (https://arxiv.org/abs/2210.02414)
        # both report this helping with stabilizing training
        self.embedding_fraction = config.embedding_fraction

        self.wte = SharedEmbedding(config.vocab_size,
                                   config.d_model,
                                   device=config.init_device)
        if not self.alibi:
            self.wpe = torch.nn.Embedding(config.max_seq_len,
                                          config.d_model,
                                          device=config.init_device)
        self.emb_drop = nn.Dropout(config.emb_pdrop)
        self.blocks = nn.ModuleList([
            MPTBlock(
                device=config.init_device,
                **config.to_dict(),
            ) for _ in range(config.n_layers)
        ])
        self.norm_f = norm_class(config.d_model, device=config.init_device)

        if config.init_device != 'meta':
            print(
                f'You are using {config.init_device=}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
            )
            self.apply(self.param_init_fn)

        self.is_causal = not self.prefix_lm

        # define attn mask
        self._attn_bias_initialized = False
        self.attn_bias = None
        self.attn_bias_shape = attn_bias_shape(
            self.attn_impl,
            config.n_heads,
            config.max_seq_len,
            self.alibi,
            prefix_lm=self.prefix_lm,
            causal=self.is_causal,
            use_sequence_id=self.attn_uses_sequence_id,
        )
        self._attn_bias_ae_initialized = False  # for active externalism
        self.attn_bias_ae = None

        if self.config.no_bias:
            for module in self.modules():
                if hasattr(module, 'bias') and isinstance(
                        module.bias, nn.Parameter):
                    if self.config.verbose:
                        warnings.warn(
                            f'Removing bias ({module.bias}) from {module}.')
                    module.register_parameter('bias', None)

        # Print verbose info
        if config.verbose and config.verbose > 2:
            print(self)
        if 'verbose' not in self.config.init_config:
            self.config.init_config['verbose'] = self.config.verbose
        if self.config.init_config['verbose'] > 1:
            init_fn_name = self.config.init_config['name']
            warnings.warn(f'Using {init_fn_name} initialization.')

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, value: nn.Embedding):
        self.wte = value

    @torch.no_grad()
    def _attn_bias(
        self,
        device,
        dtype,
        attention_mask: Optional[torch.ByteTensor] = None,
        prefix_mask: Optional[torch.ByteTensor] = None,
        sequence_id: Optional[torch.LongTensor] = None,
        seq_len: Optional[int] = None,
        use_active_externalism: bool = None,
        topk=None,
    ):
        if not self._attn_bias_initialized:
            if self.attn_bias_shape:
                self.attn_bias = torch.zeros(self.attn_bias_shape,
                                             device=device,
                                             dtype=dtype)
                self.attn_bias = build_attn_bias(
                    self.attn_impl,
                    self.config.n_heads,
                    self.config.max_seq_len,
                    device=device,
                    dtype=dtype,
                    attn_bias=self.attn_bias,
                    causal=self.is_causal,
                    alibi=self.alibi,
                    alibi_bias_max=self.alibi_bias_max
                )
            self._attn_bias_initialized = True

        if use_active_externalism:  # for active externalism, init every time since seq_len changes
            self.attn_bias_ae = build_attn_bias(
                self.attn_impl,
                self.config.n_heads,
                seq_len,
                device=device,
                dtype=dtype,
                causal=self.is_causal,
                alibi=self.alibi,
                alibi_bias_max=self.alibi_bias_max,
                for_ae=use_active_externalism,
                topk=topk
            )

            self._attn_bias_ae_initialized = True

        # flash does not support prefix_lm and will incorporate any
        # attention_mask inside the attention module
        if self.attn_impl == 'flash':
            return self.attn_bias, self.attn_bias_ae, attention_mask

        if self.attn_bias is not None:
            # .to(*args, **kwargs) is a no-op if tensor is already on
            # specified device or of specified dtype
            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)

        attn_bias = self.attn_bias

        if self.attn_bias_ae is not None:  # for active externalism
            self.attn_bias_ae = self.attn_bias_ae.to(dtype=dtype, device=device)
        attn_bias_ae = self.attn_bias_ae

        # If using torch or triton, we incorporate the prefix_mask (if appropriate)
        if self.prefix_lm:
            assert isinstance(attn_bias, torch.Tensor)  # pyright
            assert isinstance(prefix_mask, torch.Tensor)  # pyright
            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)

        # If using torch or triton, we incorporate sequence_id (if appropriate)
        if self.attn_uses_sequence_id and sequence_id is not None:
            assert isinstance(attn_bias, torch.Tensor)  # pyright
            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)

        # If using torch or triton, we incorporate attention_mask. This will output
        # None in place of attention_mask since it will not be further needed in the
        # attention modules.
        if attention_mask is not None:
            s_k = attention_mask.shape[-1]
            if attn_bias is None:
                attn_bias = torch.zeros((1, 1, 1, s_k),
                                        device=device,
                                        dtype=dtype)
            else:
                # clamp to 0 necessary for torch 2.0 compile()
                _s_k = max(0, attn_bias.size(-1) - s_k)
                attn_bias = attn_bias[:, :, :, _s_k:]
            if prefix_mask is not None and (attention_mask.shape !=
                                            prefix_mask.shape):
                raise ValueError(
                    f'attention_mask shape={attention_mask.shape} ' +
                    f'and prefix_mask shape={prefix_mask.shape} are not equal.')
            min_val = torch.finfo(attn_bias.dtype).min
            attn_bias = attn_bias.masked_fill(
                ~attention_mask.view(-1, 1, 1, s_k), min_val)

        return attn_bias, attn_bias_ae, None

    def _apply_prefix_mask(self, attn_bias: torch.Tensor,
                           prefix_mask: torch.Tensor):
        s_k, s_q = attn_bias.shape[-2:]
        if (s_k != self.config.max_seq_len) or (s_q != self.config.max_seq_len):
            raise ValueError(
                'attn_bias does not match the expected shape. ' +
                f'The last two dimensions should both be {self.config.max_seq_len} '
                + f'but are {s_k} and {s_q}.')
        seq_len = prefix_mask.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
            )

        # select seq_len subset of attn mask
        attn_bias = attn_bias[..., :seq_len, :seq_len]

        # Mix the causal mask and the bidirectional mask to get the full
        # allowable attention (i.e. full = not accounting for padding yet)
        causal = torch.tril(
            torch.ones((seq_len, seq_len),
                       dtype=torch.bool,
                       device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
        prefix = prefix_mask.view(-1, 1, 1, seq_len)
        cannot_attend = ~torch.logical_or(causal, prefix.bool())

        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)

        return attn_bias

    def _apply_sequence_id(self, attn_bias: torch.Tensor,
                           sequence_id: torch.LongTensor):
        seq_len = sequence_id.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
            )

        # select seq_len subset of attn mask
        attn_bias = attn_bias[..., :seq_len, :seq_len]

        # Restrict attention to tokens that share the same value
        # in sequence_id
        cannot_attend = torch.logical_not(
            torch.eq(
                sequence_id.view(-1, seq_len, 1),
                sequence_id.view(-1, 1, seq_len),
            )).unsqueeze(1)
        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)

        return attn_bias

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        prefix_mask: Optional[torch.ByteTensor] = None,
        sequence_id: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_active_externalism: Optional[bool] = None,
        long_range_past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        faiss_indexes: Tuple = None,
        topk: int = None,
    ):
        return_dict = (return_dict
                       if return_dict is not None else self.config.return_dict)
        use_cache = (use_cache
                     if use_cache is not None else self.config.use_cache)
        use_active_externalism = (use_active_externalism
                                  if use_active_externalism is not None else
                                  self.use_active_externalism)
        topk = (topk if topk is not None else self.topk)

        if attention_mask is not None:
            attention_mask = attention_mask.bool()

        if prefix_mask is not None:
            prefix_mask = prefix_mask.bool()

        # These args are passed in by keyword in huggingface's generate function
        # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206
        # but have not yet been fully implemented in MPTModel
        if not return_dict:
            raise NotImplementedError(
                'return_dict False is not implemented yet for MPT')
        if output_attentions:
            if self.attn_impl != 'torch':
                raise NotImplementedError(
                    'output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.'
                )

        if (attention_mask is not None and
                attention_mask[:, 0].sum() != attention_mask.shape[0] and
                self.training):
            raise NotImplementedError(
                'MPT does not support training with left padding.')

        if self.prefix_lm and prefix_mask is None:
            raise ValueError(
                'prefix_mask is a required argument when MPT is configured with prefix_lm=True.'
            )

        # Raise a not implemented error if input_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT)
        if inputs_embeds is not None:
            raise NotImplementedError(
                'inputs_embeds is not implemented for MPT.')

        if self.training:
            if self.attn_uses_sequence_id and sequence_id is None:
                raise ValueError(
                    'sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True '
                    + 'and the model is in train mode.')
            elif (self.attn_uses_sequence_id is False) and (sequence_id
                                                            is not None):
                warnings.warn(
                    'MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. '
                    +
                    'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
                )

        S = input_ids.size(1)

        assert (
            S <= self.config.max_seq_len
        ), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'

        tok_emb = self.wte(input_ids)  # type: ignore
        if self.alibi:
            x = tok_emb
        else:
            past_position = 0
            if past_key_values is not None:
                if len(past_key_values) != self.config.n_layers:
                    raise ValueError(
                        f'past_key_values must provide a past_key_value for each attention '
                        +
                        f'layer in the network ({len(past_key_values)=}; {self.config.n_layers=}).'
                    )
                # For attn_impl: triton and flash the past key tensor spec is (batch, seq, dim).
                # For attn_impl: torch the past key tensor spec is (batch, heads, head_dim, seq).
                # Here we shift position embedding using the `seq` dim of the past key
                past_position = past_key_values[0][0].size(1)
                if self.attn_impl == 'torch':
                    past_position = past_key_values[0][0].size(3)

            if S + past_position > self.config.max_seq_len:
                raise ValueError(
                    f'Cannot forward input with past sequence length {past_position} and current sequence length '
                    f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.'
                )
            pos = torch.arange(
                past_position,
                S + past_position,
                dtype=torch.long,
                device=input_ids.device,
            ).unsqueeze(0)
            if attention_mask is not None:
                # adjust the position indices to account for padding tokens
                pos = torch.clamp(
                    pos - torch.cumsum((~attention_mask).to(torch.int32),
                                       dim=1)[:, past_position:],
                    min=0,
                )

            pos_emb = self.wpe(pos)  # type: ignore
            x = tok_emb + pos_emb

        if self.embedding_fraction == 1:
            x = self.emb_drop(x)  # type: ignore
        else:
            # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414
            x_shrunk = (x * self.embedding_fraction) + (
                x.detach() * (1 - self.embedding_fraction))
            assert isinstance(self.emb_drop, nn.Module)  # pyright
            x = self.emb_drop(x_shrunk)

        seq_len = S  # for active externalism
        if past_key_values is not None:
            past_position = past_key_values[0][0].size(-1)
            seq_len += past_position

        attn_bias, attn_bias_ae, attention_mask = self._attn_bias(
            device=x.device,
            dtype=torch.float32,
            attention_mask=attention_mask,
            prefix_mask=prefix_mask,
            sequence_id=sequence_id,
            seq_len=seq_len,
            use_active_externalism=use_active_externalism,
            topk=topk
        )

        # initialize the past key values cache if it should be used
        if use_cache and past_key_values is None:
            past_key_values = [() for _ in range(self.config.n_layers)
                               ]  # type: ignore

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_idx = () if output_attentions else None
        for b_idx, block in enumerate(self.blocks):  # type: ignore
            if output_hidden_states:
                assert all_hidden_states is not None  # pyright
                all_hidden_states = all_hidden_states + (x,)
            past_key_value = (past_key_values[b_idx]
                              if past_key_values is not None else None)
            long_range_past_key_value = (
                long_range_past_key_values[b_idx]
                if (long_range_past_key_values is not None and
                    self.use_active_externalism_by_layer[b_idx] and
                    use_active_externalism is True) else None)

            if long_range_past_key_value is not None and faiss_indexes is not None:
                raise NotImplementedError(
                    'Using faiss and passing key value pairs manually are mutually exclusive right now.')

            x, attn_weights, past_key_value, reshaped_idx = block(
                x,
                past_key_value=past_key_value,
                long_range_past_key_value=long_range_past_key_value,
                attn_bias=attn_bias,
                attention_mask=attention_mask,
                attn_bias_ae=attn_bias_ae,
                is_causal=self.is_causal,
                topk=topk,
                needs_weights=output_attentions,
                faiss_indexes=faiss_indexes,
                n_layers=self.config.n_layers,
                current_layer=b_idx,
                mask_by_sim=self.mask_by_sim,
                sim_threshold=self.sim_threshold,
            )
            if past_key_values is not None:
                past_key_values[b_idx] = past_key_value

            if output_attentions:
                assert all_self_attns is not None  # pyright
                all_self_attns = all_self_attns + (attn_weights,)

                assert all_idx is not None
                all_idx = all_idx + (reshaped_idx,)

        x = self.norm_f(x)  # type: ignore

        # add hidden states from the last decoder layer
        if output_hidden_states:
            assert all_hidden_states is not None  # pyright
            all_hidden_states = all_hidden_states + (x,)

        return BaseModelOutputWithPast(
            last_hidden_state=x,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=(all_self_attns, all_idx),  # return reshaped_idx for active externalism
        )

    # Param Initialization, needed for device='meta' fast initialization
    def param_init_fn(self, module):
        init_fn_name = self.config.init_config['name']
        MODEL_INIT_REGISTRY[init_fn_name](
            module=module,
            n_layers=self.config.n_layers,
            d_model=self.config.d_model,
            **self.config.init_config,
        )

    # FSDP Wrap function
    def fsdp_wrap_fn(self, module):
        return isinstance(module, MPTBlock)

    # Activation Checkpointing
    def activation_checkpointing_fn(self, module):
        return isinstance(module, MPTBlock)

class ExtendedMPTForCausalLM(MPTPreTrainedModel):

    def __init__(self, config: ExtendedMPTConfig, external_memories=None):
        if isinstance(config, DictConfig):
            config = instantiate_from_config(config)

        super().__init__(config)
        if not config.tie_word_embeddings:
            raise ValueError(
                'MPTForCausalLM only supports tied word embeddings')

        print(f'Instantiating an MPTForCausalLM model from {__file__}')

        self.transformer: ExtendedMPTModel = ExtendedMPTModel(config)

        self.use_active_externalism = config.attn_config['use_active_externalism']
        self.memory_type = config.attn_config['memory_type']
        self._memories = None
        self.memory_device = config.memory_device

        for child in self.transformer.children():
            if isinstance(child, torch.nn.ModuleList):
                continue
            if isinstance(child, torch.nn.Module):
                child._fsdp_wrap = True

        # enables scaling output logits; similar to a softmax "temperature"
        # PaLM paper uses scale 1/sqrt(config.d_model)
        self.logit_scale = None
        if config.logit_scale is not None:
            logit_scale = config.logit_scale
            if isinstance(logit_scale, str):
                if logit_scale == 'inv_sqrt_d_model':
                    logit_scale = 1 / math.sqrt(config.d_model)
                else:
                    raise ValueError(
                        f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
                    )
            self.logit_scale = logit_scale

        if external_memories is not None:
            self._memories = external_memories
        self.memories = None

    def set_memories(self, memories):
        self.memories = memories

    def empty_memories(self):
        self.memories = None

    def get_input_embeddings(self):
        return self.transformer.wte

    def set_input_embeddings(self, value):
        self.transformer.wte = value

    def get_output_embeddings(self):
        return self.transformer.wte

    def set_output_embeddings(self, new_embeddings):
        self.transformer.wte = new_embeddings

    def set_decoder(self, decoder):
        self.transformer = decoder

    def get_decoder(self):
        return self.transformer

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        prefix_mask: Optional[torch.ByteTensor] = None,
        sequence_id: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_active_externalism: Optional[bool] = None,
        topk: int = None
    ):
        if self._memories is not None and self.memories is None:  # init memories once on first call
            self.memories = self.generate_cache(self._memories, cache_type=self.memory_type)

        return_dict = (return_dict
                       if return_dict is not None else self.config.return_dict)
        use_cache = (use_cache
                     if use_cache is not None else self.config.use_cache)
        use_active_externalism = (use_active_externalism
                                  if use_active_externalism is not None else
                                  self.use_active_externalism)

        topk = topk if topk is not None else None

        # if input_embeds is not none, raise a not implemented error
        if inputs_embeds is not None:
            raise NotImplementedError(
                'inputs_embeds has to be None (for hf/peft support).')
        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)

        if hasattr(self, "memories") and type(self.memories) == list:
            long_range_past_key_values = self.memories
            faiss_indexes = None
        elif hasattr(self, "memories"):
            long_range_past_key_values = None
            faiss_indexes = self.memories
        else:
            long_range_past_key_values = None
            faiss_indexes = None

        outputs = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            long_range_past_key_values=long_range_past_key_values,
            faiss_indexes=faiss_indexes,
            attention_mask=attention_mask,
            prefix_mask=prefix_mask,
            sequence_id=sequence_id,
            return_dict=return_dict,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            use_active_externalism=use_active_externalism,
            topk=topk
        )

        # move outputs to same device as weights for token embedding
        # needed to support HF `device_map`
        logits = self.transformer.wte(
            outputs.last_hidden_state.to(self.transformer.wte.weight.device),
            True,
        )

        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(
                    f'Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.'
                )
            logits *= self.logit_scale

        loss = None
        if labels is not None:
            _labels = torch.roll(labels, shifts=-1)
            _labels[:, -1] = -100
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                _labels.to(logits.device).view(-1),
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # Param Initialization, needed for device='meta' fast initialization
    def param_init_fn(self, module):
        init_fn_name = self.config.init_config['name']
        MODEL_INIT_REGISTRY[init_fn_name](
            module=module,
            n_layers=self.config.n_layers,
            d_model=self.config.d_model,
            **self.config.init_config,
        )

    # FSDP Wrap function
    def fsdp_wrap_fn(self, module):
        return isinstance(module, MPTBlock)

    # Activation Checkpointing
    def activation_checkpointing_fn(self, module):
        return isinstance(module, MPTBlock)

    def generate_cache(self,
                       input_ids: torch.LongTensor,
                       stride: int = 512,
                       max_len: int = 2048,
                       cache_type: str = 'manual'):
        if cache_type not in ['manual', 'faiss']:
            raise NotImplementedError(f"Cache type {cache_type} not implemented.")

        prev_end_loc = 0
        long_range_past_key_values = None
        faiss_indexes = None
        for b_idx in range(0, input_ids.size(-1), stride):  # generate kv-pairs using stride
            end_loc = min(b_idx + max_len, input_ids.size(-1))
            trg_len = end_loc - prev_end_loc
            subseq = input_ids[:, b_idx:end_loc].to(self.device)
            with torch.no_grad():
                outputs = self.transformer(subseq, use_cache=True, use_active_externalism=False)
            to_cache = [(
                kv[0][:, :, :, -trg_len:],
                kv[1][:, :, -trg_len:])
                for kv in outputs.past_key_values
            ]
            long_range_past_key_values, faiss_indexes = self.cache(
                to_cache,
                cache_type,
                long_range_past_key_values=long_range_past_key_values,
                faiss_indexes=faiss_indexes)

            prev_end_loc = end_loc
            if end_loc == input_ids.size(-1):
                break
        if long_range_past_key_values is not None:
            return long_range_past_key_values
        else:
            return faiss_indexes

    def cache(self,
              to_cache: List,
              cache_type: str = 'manual',
              long_range_past_key_values: List = None,
              faiss_indexes: faiss.IndexFlatIP = None,
              max_length_cache=100000,
              verbose=False):
        if long_range_past_key_values is not None and faiss_indexes is not None:
            raise NotImplementedError(
                "Using faiss and passing key value pairs manually are mutually exclusive right now.")

        if cache_type == 'faiss':  # add one-hot encoding to match layer, head indices
            one_hot_encodings = F.one_hot(
                torch.arange(0, self.config.n_heads * self.config.n_layers)) * 10
            if faiss_indexes is None:
                faiss_indexes = (
                    faiss.IndexFlatIP(to_cache[0][0].size(-2) + one_hot_encodings.size(-1)),
                    faiss.IndexFlatIP(to_cache[0][1].size(-1) * 2))
            kn_index, kv_index = faiss_indexes
            for b_idx, (k, v) in enumerate(to_cache):
                k_n = (k / vector_norm(k, ord=2, dim=-2, keepdim=True)).to('cpu')
                k_n = torch.concat([
                    rearrange(k_n, 'b h d s -> b (h s) d', h=self.config.n_heads),
                    one_hot_encodings[self.config.n_heads * b_idx:self.config.n_heads * (b_idx + 1)]
                    .unsqueeze(0).repeat_interleave(repeats=k.size(-1), dim=-2)
                ], dim=-1)
                kn_index.add(k_n.squeeze().numpy())

                k = rearrange(k, 'b h d s -> b (h s) d', h=self.config.n_heads)
                v = rearrange(v, 'b h s d -> b (h s) d', h=self.config.n_heads)
                kv_index.add(torch.concat([v.squeeze(), k.squeeze()], dim=1).to('cpu').numpy())
        else:
            if long_range_past_key_values is None:
                long_range_past_key_values = [
                    (k.to(self.memory_device), v.to(self.memory_device)) for k, v in to_cache]
            else:
                long_range_past_key_values = [
                    (
                        torch.concat([kv[0], to_cache[ind][0].to(self.memory_device)], dim=3),
                        torch.concat([kv[1], to_cache[ind][1].to(self.memory_device)], dim=2)
                    )
                    for ind, kv in enumerate(long_range_past_key_values)
                ]
        if long_range_past_key_values is not None:  # set a limit on manual memory length
            if long_range_past_key_values[0][0].size(-1) > max_length_cache:
                long_range_past_key_values = [
                    (
                        kv[0][:, :, :, -max_length_cache:],
                        kv[1][:, :, -max_length_cache:]
                    )
                    for kv in long_range_past_key_values]
        if verbose:
            if cache_type == 'faiss':
                print(f"{kn_index.ntotal} keys in faiss index")
            else:
                print(f"{long_range_past_key_values[0][0].size(-1)} cached kvs")

        return long_range_past_key_values, (kn_index, kv_index) if cache_type == 'faiss' else None

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if inputs_embeds is not None:
            raise NotImplementedError(
                'inputs_embeds is not implemented for MPT yet')

        attention_mask = kwargs['attention_mask'].bool()
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError(
                'MPT does not support generation with right padding.')

        if self.transformer.attn_uses_sequence_id and self.training:
            sequence_id = torch.zeros_like(input_ids[:1])
        else:
            sequence_id = None

        if past_key_values is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        if self.transformer.prefix_lm:
            # Leverage a convenience of sequential generation!
            prefix_mask = torch.ones_like(attention_mask)
            # This requires that we're using the cache
            if kwargs.get('use_cache') == False:
                raise NotImplementedError(
                    'MPT with prefix_lm=True does not support use_cache=False.')
        else:
            prefix_mask = None

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'prefix_mask': prefix_mask,
            'sequence_id': sequence_id,
            'past_key_values': past_key_values,
            'use_cache': kwargs.get('use_cache', True),
            'use_active_externalism': kwargs.get('use_active_externalism'),  # add a few more kwargs for active externalism
            'topk': kwargs.get('topk', None),
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Used by HuggingFace generate when using beam search with kv-caching.

        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
        for an example in transformers.
        """
        reordered_past = []
        for layer_past in past_key_values:
            reordered_past += [
                tuple(
                    past_state.index_select(0, beam_idx)
                    for past_state in layer_past)
            ]
        return reordered_past
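
For reference, a minimal sketch of how the external-memory ("active externalism") API defined in this file could be exercised from a checkpoint that ships these classes. The repo id, the sample texts, and the topk value are illustrative assumptions, not part of the deleted file; only the method names (generate_cache, set_memories, empty_memories, forward kwargs) come from the code above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id: substitute a checkpoint that ships this modeling code.
model = AutoModelForCausalLM.from_pretrained("<repo-with-this-file>", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("<repo-with-this-file>")

memory_ids = tokenizer("Long reference text the model should remember ...",
                       return_tensors="pt").input_ids
query_ids = tokenizer("A question about the reference text:",
                      return_tensors="pt").input_ids

# Build key/value memories from the reference text with the strided, no-grad
# pass in generate_cache, then attach them ('faiss' is the alternative cache_type).
model.set_memories(model.generate_cache(memory_ids, cache_type="manual"))

# forward hands the memories, use_active_externalism, and topk down to every block.
with torch.no_grad():
    out = model(query_ids, use_active_externalism=True, topk=5)
print(out.logits.shape)

model.empty_memories()  # detach the external memories again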