Spaces:
Paused
Paused
Commit
·
3c9a722
1
Parent(s):
ed575b1
Force vLLM V0 engine + reduce max_model_len for stability
Browse files
- VLLM_USE_V1=0: Force stable V0 engine instead of V1
- Reduce max_model_len from 32768 to 16384 for memory safety
- Keep NCCL and spawn settings for multi-GPU reliability
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- config/settings.py +1 -1
- models/real.py +3 -0
config/settings.py
CHANGED
|
@@ -25,7 +25,7 @@ class Settings(BaseSettings):
|
|
| 25 |
|
| 26 |
# vLLM configuration
|
| 27 |
vllm_tensor_parallel_size: int = 4 # Use all 4 L4 GPUs
|
| 28 |
-
vllm_max_model_len: int = 32768
|
| 29 |
|
| 30 |
# ChromaDB
|
| 31 |
chroma_persist_dir: str = "./chroma_db"
|
|
|
|
| 25 |
|
| 26 |
# vLLM configuration
|
| 27 |
vllm_tensor_parallel_size: int = 4 # Use all 4 L4 GPUs
|
| 28 |
+
vllm_max_model_len: int = 16384 # Reduced from 32768 for memory safety
|
| 29 |
|
| 30 |
# ChromaDB
|
| 31 |
chroma_persist_dir: str = "./chroma_db"
|
models/real.py
CHANGED
|
@@ -15,6 +15,9 @@ Model Loading:
|
|
| 15 |
import os
|
| 16 |
|
| 17 |
# vLLM environment variables - MUST be set before importing vLLM
|
|
|
|
|
|
|
|
|
|
| 18 |
# Fix for "Engine core initialization failed" with tensor parallelism
|
| 19 |
# See: https://github.com/vllm-project/vllm/issues/17618
|
| 20 |
os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
|
|
|
| 15 |
import os
|
| 16 |
|
| 17 |
# vLLM environment variables - MUST be set before importing vLLM
|
| 18 |
+
# Force V0 engine (more stable than V1 for multi-GPU)
|
| 19 |
+
os.environ.setdefault("VLLM_USE_V1", "0")
|
| 20 |
+
|
| 21 |
# Fix for "Engine core initialization failed" with tensor parallelism
|
| 22 |
# See: https://github.com/vllm-project/vllm/issues/17618
|
| 23 |
os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|