Instructions to use tiiuae/falcon-40b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use tiiuae/falcon-40b-instruct with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# NOTE(review): trust_remote_code=True presumably allows Transformers to execute
# the modelling code shipped in the model repository (e.g. modelling_RW.py) —
# confirm you trust that code before enabling it.
pipe = pipeline("text-generation", model="tiiuae/falcon-40b-instruct", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# The tokenizer and the model are both loaded from the same repository id.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-40b-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b-instruct", trust_remote_code=True)

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use tiiuae/falcon-40b-instruct with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "tiiuae/falcon-40b-instruct"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "tiiuae/falcon-40b-instruct",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/tiiuae/falcon-40b-instruct

SGLang

How to use tiiuae/falcon-40b-instruct with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "tiiuae/falcon-40b-instruct" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "tiiuae/falcon-40b-instruct",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "tiiuae/falcon-40b-instruct" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "tiiuae/falcon-40b-instruct",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use tiiuae/falcon-40b-instruct with Docker Model Runner:
```
docker model run hf.co/tiiuae/falcon-40b-instruct
```

Changes in modelling_RW.py to be able to handle past_key_values for faster model generations

#64

by puru22 - opened Jul 12, 2023

base: refs/heads/main

←

from: refs/pr/64

Discussion Files changed

+42

-19

Files changed (1) hide show

modelling_RW.py +42 -19

modelling_RW.py CHANGED Viewed

@@ -87,10 +87,18 @@ class RotaryEmbedding(torch.nn.Module):
         return self.cos_cached, self.sin_cached
-    def forward(self, q, k):
-        batch, seq_len, head_dim = q.shape
         cos, sin = self.cos_sin(seq_len, q.device, q.dtype)
-        return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
 def _make_causal_mask(
@@ -100,10 +108,10 @@ def _make_causal_mask(
     mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device)
     # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround
     seq_ids = torch.arange(target_length, device=device)
-    mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :]
     if past_key_values_length > 0:
-        mask[:, :past_key_values_length] = False
     expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
     return expanded_mask
@@ -264,20 +272,27 @@ class Attention(nn.Module):
         )
         value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
-        query_layer, key_layer = self.maybe_rotary(query_layer, key_layer)
         if layer_past is not None:
             past_key, past_value = layer_past
             # concatenate along seq_length dimension:
             #  - key: [batch_size * self.num_heads, head_dim, kv_length]
             #  - value: [batch_size * self.num_heads, kv_length, head_dim]
             key_layer = torch.cat((past_key, key_layer), dim=1)
             value_layer = torch.cat((past_value, value_layer), dim=1)
         _, kv_length, _ = key_layer.shape
         if use_cache is True:
-            present = (key_layer, value_layer)
         else:
             present = None
@@ -286,9 +301,14 @@ class Attention(nn.Module):
             key_layer_ = key_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
             value_layer_ = value_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
-            attn_output = F.scaled_dot_product_attention(
-                query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
-            )
             x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
             x = x.permute(0, 2, 1, 3)
@@ -528,10 +548,10 @@ class RWModel(RWPreTrainedModel):
         device = attention_mask.device
         _, src_length = input_shape
-        if src_length > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, device=device, past_key_values_length=past_key_values_length
-            )
         # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
         expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
@@ -710,16 +730,19 @@ class RWForCausalLM(RWPreTrainedModel):
         **kwargs,
     ) -> dict:
         # only last token for input_ids if past is not None
-        if past:
             input_ids = input_ids[:, -1].unsqueeze(-1)
             # the cache may be in the standard format (e.g. in contrastive search), convert to our format if needed
-            if past[0][0].shape[0] == input_ids.shape[0]:
-                past = self._convert_to_rw_cache(past)
         return {
             "input_ids": input_ids,
-            "past_key_values": past,
             "use_cache": kwargs.get("use_cache"),
             "attention_mask": attention_mask,
         }

         return self.cos_cached, self.sin_cached
+    def forward(self, q, k, past_seq_length=None):
+        if past_seq_length == None :
+            batch, seq_len, head_dim = q.shape
+        else :
+            # print("past_seq_length", past_seq_length)
+            batch, input_seq_len, head_dim = q.shape
+            seq_len = past_seq_length + input_seq_len
         cos, sin = self.cos_sin(seq_len, q.device, q.dtype)
+        if past_seq_length != None :
+            return (q * cos[:, past_seq_length:, :]) + (rotate_half(q) * sin[:, past_seq_length:, :]), (k * cos[:, past_seq_length:, :]) + (rotate_half(k) * sin[:, past_seq_length:, :])
+        else :
+            return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
 def _make_causal_mask(
     mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device)
     # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround
     seq_ids = torch.arange(target_length, device=device)
+    mask[:, past_key_values_length:] = seq_ids[:, None] >= seq_ids[None, :]
     if past_key_values_length > 0:
+        mask[:, :past_key_values_length] = True
     expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
     return expanded_mask
         )
         value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
+        if layer_past is not None :
+            past_key, past_value = layer_past
+            past_kv_length = past_key.shape[2]
+            query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length)
+        else :
+            query_layer, key_layer = self.maybe_rotary(query_layer, key_layer)
         if layer_past is not None:
             past_key, past_value = layer_past
             # concatenate along seq_length dimension:
             #  - key: [batch_size * self.num_heads, head_dim, kv_length]
             #  - value: [batch_size * self.num_heads, kv_length, head_dim]
+            past_key = past_key.permute(0, 2, 1)
             key_layer = torch.cat((past_key, key_layer), dim=1)
             value_layer = torch.cat((past_value, value_layer), dim=1)
         _, kv_length, _ = key_layer.shape
         if use_cache is True:
+            key_layer_permute = key_layer.permute(0, 2, 1)
+            present = (key_layer_permute, value_layer)
         else:
             present = None
             key_layer_ = key_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
             value_layer_ = value_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
+            if attention_mask is not None :
+                attn_output = F.scaled_dot_product_attention(
+                    query_layer_, key_layer_, value_layer_, attention_mask, 0.0, is_causal=False
+                )
+            else :
+                attn_output = F.scaled_dot_product_attention(
+                    query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
+                )
             x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
             x = x.permute(0, 2, 1, 3)
         device = attention_mask.device
         _, src_length = input_shape
+        # if src_length > 1:
+        combined_attention_mask = _make_causal_mask(
+            input_shape, device=device, past_key_values_length=past_key_values_length
+        )
         # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
         expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
         **kwargs,
     ) -> dict:
         # only last token for input_ids if past is not None
+        if kwargs.get("past_key_values", None) :
             input_ids = input_ids[:, -1].unsqueeze(-1)
+            past_key_values = kwargs["past_key_values"]
             # the cache may be in the standard format (e.g. in contrastive search), convert to our format if needed
+            # if kwargs["past_key_values"][0][0].shape[0] == input_ids.shape[0]:
+            #     past_key_values = self._convert_to_rw_cache(kwargs["past_key_values"])
+                # past_key_values = kwargs["past_key_values"]
+        else :
+            past_key_values = None
         return {
             "input_ids": input_ids,
+            "past_key_values": past_key_values,
             "use_cache": kwargs.get("use_cache"),
             "attention_mask": attention_mask,
         }