KinetoLabs Claude Opus 4.5 committed on
Commit
3c9a722
·
1 Parent(s): ed575b1

Force vLLM V0 engine + reduce max_model_len for stability

Browse files

- VLLM_USE_V1=0: Force stable V0 engine instead of V1
- Reduce max_model_len from 32768 to 16384 for memory safety
- Keep NCCL and spawn settings for multi-GPU reliability

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. config/settings.py +1 -1
  2. models/real.py +3 -0
config/settings.py CHANGED
@@ -25,7 +25,7 @@ class Settings(BaseSettings):
25
 
26
  # vLLM configuration
27
  vllm_tensor_parallel_size: int = 4 # Use all 4 L4 GPUs
28
- vllm_max_model_len: int = 32768 # Context window
29
 
30
  # ChromaDB
31
  chroma_persist_dir: str = "./chroma_db"
 
25
 
26
  # vLLM configuration
27
  vllm_tensor_parallel_size: int = 4 # Use all 4 L4 GPUs
28
+ vllm_max_model_len: int = 16384 # Reduced from 32768 for memory safety
29
 
30
  # ChromaDB
31
  chroma_persist_dir: str = "./chroma_db"
models/real.py CHANGED
@@ -15,6 +15,9 @@ Model Loading:
15
  import os
16
 
17
  # vLLM environment variables - MUST be set before importing vLLM
 
 
 
18
  # Fix for "Engine core initialization failed" with tensor parallelism
19
  # See: https://github.com/vllm-project/vllm/issues/17618
20
  os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
 
15
  import os
16
 
17
  # vLLM environment variables - MUST be set before importing vLLM
18
+ # Force V0 engine (more stable than V1 for multi-GPU)
19
+ os.environ.setdefault("VLLM_USE_V1", "0")
20
+
21
  # Fix for "Engine core initialization failed" with tensor parallelism
22
  # See: https://github.com/vllm-project/vllm/issues/17618
23
  os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")