Spaces:
Paused
Paused
fix(remove-params): Removing max_model_len
Browse files
main.py
CHANGED
|
@@ -12,30 +12,39 @@ app = FastAPI()
|
|
| 12 |
|
| 13 |
# Initialize the LLM engine
|
| 14 |
# Replace 'your-model-path' with the actual path or name of your model
|
|
|
|
|
|
|
| 15 |
|
| 16 |
engine_llama_3_2: LLM = LLM(
|
| 17 |
model='meta-llama/Llama-3.2-3B-Instruct',
|
| 18 |
revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
|
|
|
|
| 19 |
max_num_batched_tokens=512, # Reduced for T4
|
| 20 |
max_num_seqs=16, # Reduced for T4
|
| 21 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
|
|
|
| 22 |
# Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
|
| 23 |
# 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
|
| 24 |
# so that's roughly 24k / 0.5k ≈ 48 pages.
|
| 25 |
# Because when we use maximum token length, it will be slower and the memory is not enough for T4.
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
enforce_eager=True, # Disable CUDA graph
|
| 28 |
dtype='auto', # Use 'half' if you want half precision
|
| 29 |
)
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
engine_sailor_chat: LLM = LLM(
|
| 33 |
model='sail/Sailor-4B-Chat',
|
| 34 |
revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
|
| 35 |
max_num_batched_tokens=512, # Reduced for T4
|
| 36 |
max_num_seqs=16, # Reduced for T4
|
| 37 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 38 |
-
|
|
|
|
| 39 |
enforce_eager=True, # Disable CUDA graph
|
| 40 |
dtype='auto', # Use 'half' if you want half precision
|
| 41 |
)
|
|
|
|
# Initialize the LLM engine.
# Replace 'your-model-path' with the actual path or name of your model.
# Example:
# https://huggingface.co/spaces/damienbenveniste/deploy_vLLM/blob/b210a934d4ff7b68254d42fa28736d74649e610d/app.py#L17-L20

engine_llama_3_2: LLM = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    # Pin the exact model revision so deployments are reproducible.
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
    # Scheduler limits — see:
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
    max_num_batched_tokens=512,   # Reduced for T4
    max_num_seqs=16,              # Reduced for T4
    gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
    # NOTE(review): tensor_parallel_size=2 shards the model across two GPUs;
    # the comments above target a single T4 — confirm the Space actually has
    # two GPUs, otherwise this should be 1 (the default).
    tensor_parallel_size=2,
    # Llama-3.2-3B-Instruct's max context length is 131072 tokens, but we
    # would cap it at 32k: 3/4 of 32k tokens is ~24k words, and at ~500 words
    # per page that is roughly 24k / 0.5k ≈ 48 pages. Using the maximum token
    # length is slower and exceeds T4 memory.
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
    # max_model_len=32768,  # removed — see the ValueError note below
    enforce_eager=True,  # Disable CUDA graph
    dtype='auto',        # Use 'half' if you want half precision
)

# max_model_len was removed because vLLM raised:
#   ValueError: max_num_batched_tokens (512) is smaller than max_model_len
#   (32768). This effectively limits the maximum sequence length to
#   max_num_batched_tokens and makes vLLM reject longer sequences.
#   Please increase max_num_batched_tokens or decrease max_model_len.
|
engine_sailor_chat: LLM = LLM(
    model='sail/Sailor-4B-Chat',
    # Pin the exact model revision so deployments are reproducible.
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    max_num_batched_tokens=512,   # Reduced for T4
    max_num_seqs=16,              # Reduced for T4
    gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
    # NOTE(review): tensor_parallel_size=2 requires two GPUs; the comments
    # above target a single T4 — confirm the deployment hardware.
    tensor_parallel_size=2,
    # max_model_len=32768,  # removed — conflicts with max_num_batched_tokens=512
    enforce_eager=True,  # Disable CUDA graph
    dtype='auto',        # Use 'half' if you want half precision
)
|