Spaces:
Paused
Paused
fix(remove-params): Removing max_model_len
Browse files
main.py
CHANGED
|
@@ -12,30 +12,39 @@ app = FastAPI()
|
|
| 12 |
|
| 13 |
# Initialize the LLM engine
|
| 14 |
# Replace 'your-model-path' with the actual path or name of your model
|
|
|
|
|
|
|
| 15 |
|
| 16 |
engine_llama_3_2: LLM = LLM(
|
| 17 |
model='meta-llama/Llama-3.2-3B-Instruct',
|
| 18 |
revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
|
|
|
|
| 19 |
max_num_batched_tokens=512, # Reduced for T4
|
| 20 |
max_num_seqs=16, # Reduced for T4
|
| 21 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
|
|
|
| 22 |
# Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
|
| 23 |
# 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
|
| 24 |
# so that's roughly 24k / 0.5k ≈ 48 pages.
|
| 25 |
# Because when we use maximum token length, it will be slower and the memory is not enough for T4.
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
enforce_eager=True, # Disable CUDA graph
|
| 28 |
dtype='auto', # Use 'half' if you want half precision
|
| 29 |
)
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
engine_sailor_chat: LLM = LLM(
|
| 33 |
model='sail/Sailor-4B-Chat',
|
| 34 |
revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
|
| 35 |
max_num_batched_tokens=512, # Reduced for T4
|
| 36 |
max_num_seqs=16, # Reduced for T4
|
| 37 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 38 |
-
|
|
|
|
| 39 |
enforce_eager=True, # Disable CUDA graph
|
| 40 |
dtype='auto', # Use 'half' if you want half precision
|
| 41 |
)
|
|
|
|
# Initialize the LLM engine.
# Replace 'your-model-path' with the actual path or name of your model.
# Example:
# https://huggingface.co/spaces/damienbenveniste/deploy_vLLM/blob/b210a934d4ff7b68254d42fa28736d74649e610d/app.py#L17-L20

engine_llama_3_2: LLM = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    # Pin the exact model revision so deployments are reproducible.
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
    # Scheduler limits — see:
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
    max_num_batched_tokens=512,   # Reduced for T4
    max_num_seqs=16,              # Reduced for T4
    gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
    # NOTE(review): tensor_parallel_size=2 shards the model across two GPUs;
    # the comments above target a single T4 — confirm the Space actually has
    # two GPUs, otherwise this should be 1 (the default).
    tensor_parallel_size=2,
    # Llama-3.2-3B-Instruct's max context length is 131072 tokens, but we
    # would cap it at 32k: 3/4 of 32k tokens is ~24k words, and at ~500 words
    # per page that is roughly 24k / 0.5k ≈ 48 pages. Using the maximum token
    # length is slower and exceeds T4 memory.
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
    # max_model_len=32768,  # removed — see the ValueError note below
    enforce_eager=True,  # Disable CUDA graph
    dtype='auto',        # Use 'half' if you want half precision
)

# max_model_len was removed because vLLM raised:
#   ValueError: max_num_batched_tokens (512) is smaller than max_model_len
#   (32768). This effectively limits the maximum sequence length to
#   max_num_batched_tokens and makes vLLM reject longer sequences.
#   Please increase max_num_batched_tokens or decrease max_model_len.
|
engine_sailor_chat: LLM = LLM(
    model='sail/Sailor-4B-Chat',
    # Pin the exact model revision so deployments are reproducible.
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    max_num_batched_tokens=512,   # Reduced for T4
    max_num_seqs=16,              # Reduced for T4
    gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
    # NOTE(review): tensor_parallel_size=2 requires two GPUs; the comments
    # above target a single T4 — confirm the deployment hardware.
    tensor_parallel_size=2,
    # max_model_len=32768,  # removed — conflicts with max_num_batched_tokens=512
    enforce_eager=True,  # Disable CUDA graph
    dtype='auto',        # Use 'half' if you want half precision
)
|