Instructions for using normalcomputing/extended-mind-mpt-7b-chat with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.

Libraries

How to use normalcomputing/extended-mind-mpt-7b-chat with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="normalcomputing/extended-mind-mpt-7b-chat", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("normalcomputing/extended-mind-mpt-7b-chat", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use normalcomputing/extended-mind-mpt-7b-chat with vLLM:

Install from pip and serve the model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "normalcomputing/extended-mind-mpt-7b-chat"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "normalcomputing/extended-mind-mpt-7b-chat",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/normalcomputing/extended-mind-mpt-7b-chat

SGLang

How to use normalcomputing/extended-mind-mpt-7b-chat with SGLang:

Install from pip and serve the model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "normalcomputing/extended-mind-mpt-7b-chat" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "normalcomputing/extended-mind-mpt-7b-chat",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "normalcomputing/extended-mind-mpt-7b-chat" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "normalcomputing/extended-mind-mpt-7b-chat",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use normalcomputing/extended-mind-mpt-7b-chat with Docker Model Runner:
```
docker model run hf.co/normalcomputing/extended-mind-mpt-7b-chat
```

Upload 4 files

by phoebeklett - opened Oct 25, 2023

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+26

-26

Files changed (4) hide show

attention.py +5 -6
blocks.py +1 -1
configuration.py +5 -0
modeling_mpt.py +15 -19

attention.py CHANGED Viewed

@@ -95,10 +95,10 @@ def scaled_multihead_dot_product_attention(
             )
         attn_weight = attn_weight + attn_bias
-    if needs_weights:
         reshaped_idx = None
     if long_range_past_key_value is not None or faiss_indexes is not None:
-        if long_range_past_key_value is not None: #manual external memories
             k_cache, v_cache = long_range_past_key_value
             s_cache = k_cache.size(-1)
@@ -134,15 +134,14 @@ def scaled_multihead_dot_product_attention(
             selected_k=rearrange(torch.tensor(kv_index.reconstruct_batch(I.flatten()))[:,:d], '(h s) d -> 1 h d s', h=32).to(q.device)
             selected_v=rearrange(torch.tensor(kv_index.reconstruct_batch(I.flatten()))[:,d:], '(h s) d -> 1 h s d', h=32).to(q.device)
         s_k_ae = selected_k.size(-1)
         s_k += s_k_ae
         attn_weight_cache = q.matmul(selected_k) * softmax_scale
         if mask_by_sim:
             attn_weight_cache = attn_weight_cache.masked_fill(sim_mask, min_val)
-        if attn_bias_ae is not None:
-            # clamp to 0 necessary for torch 2.0 compile()
             _s_q = max(0, attn_bias_ae.size(2) - s_q)
             _s_k = max(0, attn_bias_ae.size(3) - s_k_ae)
             attn_bias_ae = attn_bias_ae[:, :, _s_q:, _s_k:]
@@ -710,7 +709,7 @@ def build_attn_bias(
                         for_ae=for_ae,
                         topk=topk
                     ))
-            else:
                 attn_bias = build_alibi_bias(
                         n_heads,
                         seq_len,

             )
         attn_weight = attn_weight + attn_bias
+    if needs_weights: #will return memory indices w/attention weights
         reshaped_idx = None
     if long_range_past_key_value is not None or faiss_indexes is not None:
+        if long_range_past_key_value is not None: #manual memories
             k_cache, v_cache = long_range_past_key_value
             s_cache = k_cache.size(-1)
             selected_k=rearrange(torch.tensor(kv_index.reconstruct_batch(I.flatten()))[:,:d], '(h s) d -> 1 h d s', h=32).to(q.device)
             selected_v=rearrange(torch.tensor(kv_index.reconstruct_batch(I.flatten()))[:,d:], '(h s) d -> 1 h s d', h=32).to(q.device)
         s_k_ae = selected_k.size(-1)
         s_k += s_k_ae
         attn_weight_cache = q.matmul(selected_k) * softmax_scale
         if mask_by_sim:
             attn_weight_cache = attn_weight_cache.masked_fill(sim_mask, min_val)
+        if attn_bias_ae is not None: #add alibi bias to memories
             _s_q = max(0, attn_bias_ae.size(2) - s_q)
             _s_k = max(0, attn_bias_ae.size(3) - s_k_ae)
             attn_bias_ae = attn_bias_ae[:, :, _s_q:, _s_k:]
                         for_ae=for_ae,
                         topk=topk
                     ))
+            else: #for memories
                 attn_bias = build_alibi_bias(
                         n_heads,
                         seq_len,

blocks.py CHANGED Viewed

@@ -7,7 +7,7 @@
 from typing import Dict, Optional, Tuple
 import torch
 import torch.nn as nn
-from .attention import ATTN_CLASS_REGISTRY
 from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
 class MPTMLP(nn.Module):

 from typing import Dict, Optional, Tuple
 import torch
 import torch.nn as nn
+from extended_mind_transformers.mpt.attention import ATTN_CLASS_REGISTRY
 from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
 class MPTMLP(nn.Module):

configuration.py CHANGED Viewed

@@ -165,6 +165,11 @@ class ExtendedMPTConfig(PretrainedConfig):
             init_config_defaults,
         )
         if self.d_model % self.n_heads != 0:
             raise ValueError('d_model must be divisible by n_heads')
         if any(

             init_config_defaults,
         )
+        if self.attn_config['memory_type']=='faiss' and self.attn_config['mask_by_sim'] is True:
+            raise ValueError(
+                'mask_by_sim is not supported for faiss memory type.'
+            )
         if self.d_model % self.n_heads != 0:
             raise ValueError('d_model must be divisible by n_heads')
         if any(

modeling_mpt.py CHANGED Viewed

@@ -27,10 +27,10 @@ from llmfoundry.models.layers.custom_embedding import SharedEmbedding
 from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
 from llmfoundry.models.utils.param_init_fns import MODEL_INIT_REGISTRY
-from .configuration import ExtendedMPTConfig
-from .attention import attn_bias_shape, build_attn_bias
-from .blocks import MPTBlock
-from .utils import instantiate_from_config
 Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
@@ -111,7 +111,7 @@ class ExtendedMPTModel(MPTPreTrainedModel):
             causal=self.is_causal,
             use_sequence_id=self.attn_uses_sequence_id,
         )
-        self._attn_bias_ae_initialized = False
         self.attn_bias_ae = None
         if self.config.no_bias:
@@ -168,7 +168,7 @@ class ExtendedMPTModel(MPTPreTrainedModel):
                 )
             self._attn_bias_initialized = True
-        if use_active_externalism:
             self.attn_bias_ae = build_attn_bias(
                 self.attn_impl,
                 self.config.n_heads,
@@ -196,7 +196,7 @@ class ExtendedMPTModel(MPTPreTrainedModel):
         attn_bias = self.attn_bias
-        if self.attn_bias_ae is not None:
             self.attn_bias_ae = self.attn_bias_ae.to(dtype=dtype, device=device)
         attn_bias_ae = self.attn_bias_ae
@@ -417,9 +417,7 @@ class ExtendedMPTModel(MPTPreTrainedModel):
             assert isinstance(self.emb_drop, nn.Module)  # pyright
             x = self.emb_drop(x_shrunk)
-        # self._attn_bias_initialized = False #right now this needs to run each step
-        seq_len = S
         if past_key_values is not None:
             past_position = past_key_values[0][0].size(-1)
             seq_len += past_position
@@ -493,7 +491,7 @@ class ExtendedMPTModel(MPTPreTrainedModel):
             last_hidden_state=x,
             past_key_values=past_key_values,
             hidden_states=all_hidden_states,
-            attentions=(all_self_attns, all_idx),
         )
     # Param Initialization, needed for device='meta' fast initialization
@@ -598,7 +596,7 @@ class ExtendedMPTForCausalLM(MPTPreTrainedModel):
         use_active_externalism: Optional[bool]=None,
         topk:int=None
     ):
-        if self._memories is not None and self.memories is None:
             self.memories = self.generate_cache(self._memories, cache_type=self.memory_type)
         return_dict = (return_dict
@@ -702,9 +700,8 @@ class ExtendedMPTForCausalLM(MPTPreTrainedModel):
         prev_end_loc=0
         long_range_past_key_values = None
         faiss_indexes= None
-        for b_idx in range(0, input_ids.size(-1), stride):
             end_loc = min(b_idx + max_len, input_ids.size(-1))
             trg_len = end_loc - prev_end_loc
             subseq = input_ids[:, b_idx:end_loc].to(self.device)
             with torch.no_grad():
@@ -734,7 +731,7 @@ class ExtendedMPTForCausalLM(MPTPreTrainedModel):
         if long_range_past_key_values is not None and faiss_indexes is not None:
             raise NotImplementedError("Using faiss and passing key value pairs manually are mutually exclusive right now.")
-        if cache_type=='faiss':
             one_hot_encodings = F.one_hot(torch.arange(0, self.config.n_heads*self.config.n_layers))*10
             if faiss_indexes is None:
                 faiss_indexes = (faiss.IndexFlatIP(to_cache[0][0].size(-2)+one_hot_encodings.size(-1)), faiss.IndexFlatIP(to_cache[0][1].size(-1)*2))
@@ -747,7 +744,6 @@ class ExtendedMPTForCausalLM(MPTPreTrainedModel):
                 k= rearrange(k, 'b h d s -> b (h s) d', h=self.config.n_heads)
                 v= rearrange(v, 'b h s d -> b (h s) d', h=self.config.n_heads)
                 kv_index.add(torch.concat([v.squeeze(), k.squeeze()], dim=1).to('cpu').numpy())
         else:
             if long_range_past_key_values is None:
                 long_range_past_key_values = [(k.to(self.memory_device),v.to(self.memory_device)) for k,v in to_cache]
@@ -759,8 +755,8 @@ class ExtendedMPTForCausalLM(MPTPreTrainedModel):
                     )
                     for ind, kv in enumerate(long_range_past_key_values)
                 ]
-        if long_range_past_key_values is not None:
-            if long_range_past_key_values[0][0].size(-1) > max_length_cache: #set a limit on manual memory length
                 long_range_past_key_values = [
                     (
                         kv[0][:, :, :, -max_length_cache:],
@@ -816,7 +812,7 @@ class ExtendedMPTForCausalLM(MPTPreTrainedModel):
             'sequence_id': sequence_id,
             'past_key_values': past_key_values,
             'use_cache': kwargs.get('use_cache', True),
-            'use_active_externalism': kwargs.get('use_active_externalism'),
             'topk': kwargs.get('topk', None),
         }

 from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
 from llmfoundry.models.utils.param_init_fns import MODEL_INIT_REGISTRY
+from extended_mind_transformers.mpt.configuration import ExtendedMPTConfig
+from extended_mind_transformers.mpt.attention import attn_bias_shape, build_attn_bias
+from extended_mind_transformers.mpt.blocks import MPTBlock
+from extended_mind_transformers.utils import instantiate_from_config
 Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
             causal=self.is_causal,
             use_sequence_id=self.attn_uses_sequence_id,
         )
+        self._attn_bias_ae_initialized = False #for active externalism
         self.attn_bias_ae = None
         if self.config.no_bias:
                 )
             self._attn_bias_initialized = True
+        if use_active_externalism: #for active externalism, init every time since seq_len changes
             self.attn_bias_ae = build_attn_bias(
                 self.attn_impl,
                 self.config.n_heads,
         attn_bias = self.attn_bias
+        if self.attn_bias_ae is not None: #for active externalism
             self.attn_bias_ae = self.attn_bias_ae.to(dtype=dtype, device=device)
         attn_bias_ae = self.attn_bias_ae
             assert isinstance(self.emb_drop, nn.Module)  # pyright
             x = self.emb_drop(x_shrunk)
+        seq_len = S #for active externalism
         if past_key_values is not None:
             past_position = past_key_values[0][0].size(-1)
             seq_len += past_position
             last_hidden_state=x,
             past_key_values=past_key_values,
             hidden_states=all_hidden_states,
+            attentions=(all_self_attns, all_idx), #return reshaped_idx for active externalism
         )
     # Param Initialization, needed for device='meta' fast initialization
         use_active_externalism: Optional[bool]=None,
         topk:int=None
     ):
+        if self._memories is not None and self.memories is None: #init memories once on first call
             self.memories = self.generate_cache(self._memories, cache_type=self.memory_type)
         return_dict = (return_dict
         prev_end_loc=0
         long_range_past_key_values = None
         faiss_indexes= None
+        for b_idx in range(0, input_ids.size(-1), stride): #generate kv-pairs using stride
             end_loc = min(b_idx + max_len, input_ids.size(-1))
             trg_len = end_loc - prev_end_loc
             subseq = input_ids[:, b_idx:end_loc].to(self.device)
             with torch.no_grad():
         if long_range_past_key_values is not None and faiss_indexes is not None:
             raise NotImplementedError("Using faiss and passing key value pairs manually are mutually exclusive right now.")
+        if cache_type=='faiss': #add one-hot encoding to match layer, head indices
             one_hot_encodings = F.one_hot(torch.arange(0, self.config.n_heads*self.config.n_layers))*10
             if faiss_indexes is None:
                 faiss_indexes = (faiss.IndexFlatIP(to_cache[0][0].size(-2)+one_hot_encodings.size(-1)), faiss.IndexFlatIP(to_cache[0][1].size(-1)*2))
                 k= rearrange(k, 'b h d s -> b (h s) d', h=self.config.n_heads)
                 v= rearrange(v, 'b h s d -> b (h s) d', h=self.config.n_heads)
                 kv_index.add(torch.concat([v.squeeze(), k.squeeze()], dim=1).to('cpu').numpy())
         else:
             if long_range_past_key_values is None:
                 long_range_past_key_values = [(k.to(self.memory_device),v.to(self.memory_device)) for k,v in to_cache]
                     )
                     for ind, kv in enumerate(long_range_past_key_values)
                 ]
+        if long_range_past_key_values is not None: #set a limit on manual memory length
+            if long_range_past_key_values[0][0].size(-1) > max_length_cache:
                 long_range_past_key_values = [
                     (
                         kv[0][:, :, :, -max_length_cache:],
             'sequence_id': sequence_id,
             'past_key_values': past_key_values,
             'use_cache': kwargs.get('use_cache', True),
+            'use_active_externalism': kwargs.get('use_active_externalism'), #add a few more kwargs for active externalism
             'topk': kwargs.get('topk', None),
         }