Text Generation
Transformers
Safetensors
multilingual
phi3
nlp
code
conversational
custom_code
text-generation-inference
Instructions to use Lexius/Phi-3.5-mini-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Lexius/Phi-3.5-mini-instruct with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Lexius/Phi-3.5-mini-instruct", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Lexius/Phi-3.5-mini-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Lexius/Phi-3.5-mini-instruct", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use Lexius/Phi-3.5-mini-instruct with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Lexius/Phi-3.5-mini-instruct"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Lexius/Phi-3.5-mini-instruct",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/Lexius/Phi-3.5-mini-instruct
- SGLang
How to use Lexius/Phi-3.5-mini-instruct with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Lexius/Phi-3.5-mini-instruct" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Lexius/Phi-3.5-mini-instruct",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Lexius/Phi-3.5-mini-instruct" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Lexius/Phi-3.5-mini-instruct",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use Lexius/Phi-3.5-mini-instruct with Docker Model Runner:
docker model run hf.co/Lexius/Phi-3.5-mini-instruct
support transformers 4.55
#1
by lerignoux - opened
- modeling_phi3.py +54 -9
modeling_phi3.py
CHANGED
|
@@ -330,7 +330,14 @@ class Phi3Attention(nn.Module):
|
|
| 330 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 331 |
"with a layer index."
|
| 332 |
)
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 335 |
|
| 336 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
@@ -448,7 +455,14 @@ class Phi3FlashAttention2(Phi3Attention):
|
|
| 448 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 449 |
"with a layer index."
|
| 450 |
)
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
# Because the input can be padded, the absolute sequence length depends on the max position id.
|
| 454 |
rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item() + 1)
|
|
@@ -738,7 +752,14 @@ class Phi3SdpaAttention(Phi3Attention):
|
|
| 738 |
|
| 739 |
kv_seq_len = key_states.shape[-2]
|
| 740 |
if past_key_value is not None:
|
| 741 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 743 |
|
| 744 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
@@ -1060,7 +1081,15 @@ class Phi3Model(Phi3PreTrainedModel):
|
|
| 1060 |
use_legacy_cache = not isinstance(past_key_values, Cache)
|
| 1061 |
if use_legacy_cache:
|
| 1062 |
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
|
| 1063 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1064 |
|
| 1065 |
if position_ids is None:
|
| 1066 |
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
|
@@ -1288,19 +1317,35 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
|
|
| 1288 |
# When the first time input length reached long and short factor switching point, enforce re-compute cache
|
| 1289 |
# It will cause downside of slower at this single token position, however, better than current failure.
|
| 1290 |
if past_key_values and self.config.rope_scaling and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1:
|
| 1291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1292 |
if past_length <= self.config.original_max_position_embeddings:
|
| 1293 |
past_key_values = None
|
| 1294 |
|
|
|
|
|
|
|
|
|
|
| 1295 |
if past_key_values is not None:
|
| 1296 |
-
if isinstance(past_key_values, Cache):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1297 |
cache_length = past_key_values.get_seq_length()
|
| 1298 |
past_length = past_key_values.seen_tokens
|
| 1299 |
-
# Fixing AttributeError: 'DynamicCache' object has no attribute 'get_max_length'
|
| 1300 |
-
# https://github.com/huggingface/transformers/issues/36071
|
| 1301 |
-
# max_cache_length = past_key_values.get_max_length()
|
| 1302 |
max_cache_length = past_key_values.get_max_cache_shape()
|
| 1303 |
else:
|
|
|
|
| 1304 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
| 1305 |
max_cache_length = None
|
| 1306 |
|
|
|
|
| 330 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 331 |
"with a layer index."
|
| 332 |
)
|
| 333 |
+
if not hasattr(past_key_value, 'get_usable_length'):
|
| 334 |
+
# Transformers >= 4.55
|
| 335 |
+
past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
|
| 336 |
+
kv_seq_len += past_len
|
| 337 |
+
else:
|
| 338 |
+
# Transformers < 4.55
|
| 339 |
+
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
|
| 340 |
+
|
| 341 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 342 |
|
| 343 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
|
|
| 455 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 456 |
"with a layer index."
|
| 457 |
)
|
| 458 |
+
if not hasattr(past_key_value, 'get_usable_length'):
|
| 459 |
+
# Transformers >= 4.55
|
| 460 |
+
past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
|
| 461 |
+
kv_seq_len += past_len
|
| 462 |
+
else:
|
| 463 |
+
# Transformers < 4.55
|
| 464 |
+
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
|
| 465 |
+
|
| 466 |
|
| 467 |
# Because the input can be padded, the absolute sequence length depends on the max position id.
|
| 468 |
rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item() + 1)
|
|
|
|
| 752 |
|
| 753 |
kv_seq_len = key_states.shape[-2]
|
| 754 |
if past_key_value is not None:
|
| 755 |
+
if not hasattr(past_key_value, 'get_usable_length'):
|
| 756 |
+
# Transformers >= 4.55
|
| 757 |
+
past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
|
| 758 |
+
kv_seq_len += past_len
|
| 759 |
+
else:
|
| 760 |
+
# Transformers < 4.55
|
| 761 |
+
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
|
| 762 |
+
|
| 763 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 764 |
|
| 765 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
|
|
| 1081 |
use_legacy_cache = not isinstance(past_key_values, Cache)
|
| 1082 |
if use_legacy_cache:
|
| 1083 |
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
|
| 1084 |
+
if isinstance(past_key_values, Cache) and not hasattr(past_key_values, 'get_usable_length'):
|
| 1085 |
+
# Transformers >= 4.55
|
| 1086 |
+
past_key_values_length = past_key_values.get_seq_length()
|
| 1087 |
+
elif isinstance(past_key_values, Cache):
|
| 1088 |
+
# 4.49 <= Transformers < 4.55
|
| 1089 |
+
past_key_values_length = past_key_values.get_usable_length(seq_length)
|
| 1090 |
+
else:
|
| 1091 |
+
# No cache given on first forward, keep length at 0
|
| 1092 |
+
past_key_values_length = 0
|
| 1093 |
|
| 1094 |
if position_ids is None:
|
| 1095 |
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
|
|
|
| 1317 |
# When the first time input length reached long and short factor switching point, enforce re-compute cache
|
| 1318 |
# It will cause downside of slower at this single token position, however, better than current failure.
|
| 1319 |
if past_key_values and self.config.rope_scaling and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1:
|
| 1320 |
+
if isinstance(past_key_values, Cache) and not hasattr(past_key_values, 'seen_tokens'):
|
| 1321 |
+
# Transformers >= 4.55
|
| 1322 |
+
cache_length = past_key_values.get_seq_length()
|
| 1323 |
+
past_length = cache_length
|
| 1324 |
+
elif isinstance(past_key_values, Cache):
|
| 1325 |
+
# 4.49 <= Transformers < 4.55
|
| 1326 |
+
past_length = past_key_values.seen_tokens
|
| 1327 |
+
else:
|
| 1328 |
+
# Transformers < 4.49
|
| 1329 |
+
past_key_values[0][0].shape[2]
|
| 1330 |
if past_length <= self.config.original_max_position_embeddings:
|
| 1331 |
past_key_values = None
|
| 1332 |
|
| 1333 |
+
cache_length = None
|
| 1334 |
+
past_length = None
|
| 1335 |
+
max_cache_length = None
|
| 1336 |
if past_key_values is not None:
|
| 1337 |
+
if isinstance(past_key_values, Cache) and not hasattr(past_key_values, 'seen_tokens'):
|
| 1338 |
+
# Transformers >= 4.55
|
| 1339 |
+
cache_length = past_key_values.get_seq_length()
|
| 1340 |
+
past_length = cache_length
|
| 1341 |
+
max_cache_length = past_key_values.get_max_cache_shape()
|
| 1342 |
+
elif isinstance(past_key_values, Cache):
|
| 1343 |
+
# 4.49 <= Transformers < 4.55
|
| 1344 |
cache_length = past_key_values.get_seq_length()
|
| 1345 |
past_length = past_key_values.seen_tokens
|
|
|
|
|
|
|
|
|
|
| 1346 |
max_cache_length = past_key_values.get_max_cache_shape()
|
| 1347 |
else:
|
| 1348 |
+
# Transformers < 4.49
|
| 1349 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
| 1350 |
max_cache_length = None
|
| 1351 |
|