Text Generation
Transformers
PyTorch
English
falcon
custom_code
Eval Results
text-generation-inference
Instructions to use tiiuae/falcon-40b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use tiiuae/falcon-40b-instruct with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="tiiuae/falcon-40b-instruct", trust_remote_code=True)
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-40b-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b-instruct", trust_remote_code=True)
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use tiiuae/falcon-40b-instruct with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "tiiuae/falcon-40b-instruct"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "tiiuae/falcon-40b-instruct",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/tiiuae/falcon-40b-instruct
- SGLang
How to use tiiuae/falcon-40b-instruct with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "tiiuae/falcon-40b-instruct" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "tiiuae/falcon-40b-instruct",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "tiiuae/falcon-40b-instruct" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "tiiuae/falcon-40b-instruct",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use tiiuae/falcon-40b-instruct with Docker Model Runner:
docker model run hf.co/tiiuae/falcon-40b-instruct
Use input attention mask instead of causal mask in attention
#72
by CyberZHG - opened
- modelling_RW.py +2 -2
modelling_RW.py
CHANGED
|
@@ -281,13 +281,14 @@ class Attention(nn.Module):
|
|
| 281 |
else:
|
| 282 |
present = None
|
| 283 |
|
|
|
|
| 284 |
if alibi is None:
|
| 285 |
query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
|
| 286 |
key_layer_ = key_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
|
| 287 |
value_layer_ = value_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
|
| 288 |
|
| 289 |
attn_output = F.scaled_dot_product_attention(
|
| 290 |
-
query_layer_, key_layer_, value_layer_,
|
| 291 |
)
|
| 292 |
|
| 293 |
x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
|
|
@@ -300,7 +301,6 @@ class Attention(nn.Module):
|
|
| 300 |
assert not output_attentions # not supported.
|
| 301 |
return outputs
|
| 302 |
else:
|
| 303 |
-
attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, -1e9).to(torch.bfloat16)
|
| 304 |
matmul_result = query_layer @ key_layer.transpose(-1, -2)
|
| 305 |
|
| 306 |
# change view to [batch_size, num_heads, q_length, kv_length]
|
|
|
|
| 281 |
else:
|
| 282 |
present = None
|
| 283 |
|
| 284 |
+
attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, -1e9).to(query_layer.dtype)
|
| 285 |
if alibi is None:
|
| 286 |
query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
|
| 287 |
key_layer_ = key_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
|
| 288 |
value_layer_ = value_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
|
| 289 |
|
| 290 |
attn_output = F.scaled_dot_product_attention(
|
| 291 |
+
query_layer_, key_layer_, value_layer_, attention_mask_float, 0.0, is_causal=False
|
| 292 |
)
|
| 293 |
|
| 294 |
x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
|
|
|
|
| 301 |
assert not output_attentions # not supported.
|
| 302 |
return outputs
|
| 303 |
else:
|
|
|
|
| 304 |
matmul_result = query_layer @ key_layer.transpose(-1, -2)
|
| 305 |
|
| 306 |
# change view to [batch_size, num_heads, q_length, kv_length]
|