Commit 7d5c713 · Parent: b2fe3f4
Reduce context/memory to minimize NCCL overhead on L4s
Changes:
- max_model_len: 16384 → 8192 (half the context length)
- gpu_memory_utilization: 0.70 → 0.50 (less memory pressure)
L4 GPUs lack NVLink, so tensor-parallel communication goes over PCIe and is
slow. Reducing these values minimizes NCCL overhead.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- config/settings.py +1 -1
- models/real.py +1 -1
config/settings.py CHANGED
@@ -25,7 +25,7 @@ class Settings(BaseSettings):
 
     # vLLM configuration
     vllm_tensor_parallel_size: int = 4  # Use all 4 L4 GPUs
-    vllm_max_model_len: int = 16384
+    vllm_max_model_len: int = 8192  # Reduced to minimize NCCL overhead on L4s
 
     # ChromaDB
     chroma_persist_dir: str = "./chroma_db"
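For orientation, here is a minimal sketch of the Settings class this hunk lives in; the hunk header only confirms class Settings(BaseSettings), so the pydantic-settings import is an assumption, and only the fields visible in the diff are taken from the source.

# Minimal sketch, assuming BaseSettings comes from pydantic-settings;
# only the fields visible in the diff are taken from the source.
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # vLLM configuration
    vllm_tensor_parallel_size: int = 4  # Use all 4 L4 GPUs
    vllm_max_model_len: int = 8192  # Reduced to minimize NCCL overhead on L4s

    # ChromaDB
    chroma_persist_dir: str = "./chroma_db"

settings = Settings()  # field values can be overridden via environment variables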
models/real.py CHANGED
@@ -92,7 +92,7 @@ class RealModelStack:
             tensor_parallel_size=settings.vllm_tensor_parallel_size,
             trust_remote_code=True,
             # dtype removed - FP8 model auto-detects native quantization
-            gpu_memory_utilization=0.70,
+            gpu_memory_utilization=0.50,  # Reduced to minimize NCCL overhead on L4s
             max_model_len=settings.vllm_max_model_len,
             # enforce_eager removed - let vLLM default (False) per official
         )
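Taken together, the two changes land in the vLLM engine construction roughly as sketched below. The model id is a hypothetical placeholder and everything outside the diffed lines is an assumption, but the keyword arguments are genuine vllm.LLM parameters.

# Sketch of the engine construction after this commit; the model id is a
# placeholder, the keyword arguments mirror the diff.
from vllm import LLM

llm = LLM(
    model="org/fp8-model-placeholder",  # hypothetical, not from the diff
    tensor_parallel_size=settings.vllm_tensor_parallel_size,  # all 4 L4 GPUs
    trust_remote_code=True,
    # dtype omitted - the FP8 checkpoint carries its own quantization config
    gpu_memory_utilization=0.50,  # lowered from 0.70 to ease memory pressure
    max_model_len=settings.vllm_max_model_len,  # 8192 after this commit
)

The commit's rationale: with no NVLink between the L4s, tensor-parallel traffic crosses PCIe, so keeping the context window and memory footprint small keeps NCCL overhead down.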