Forrest Wargo
committed on
Commit
·
eccbc24
1
Parent(s):
222ced7
fixing vllms
Browse files - handler.py +4 -0
handler.py
CHANGED
|
@@ -59,6 +59,8 @@ class EndpointHandler:
|
|
| 59 |
pass
|
| 60 |
|
| 61 |
# Auto-detect tensor parallel size from visible devices
|
|
|
|
|
|
|
| 62 |
visible = os.environ.get("CUDA_VISIBLE_DEVICES")
|
| 63 |
if visible and visible.strip():
|
| 64 |
try:
|
|
@@ -80,6 +82,8 @@ class EndpointHandler:
|
|
| 80 |
pipeline_parallel_size=1,
|
| 81 |
gpu_memory_utilization=0.95,
|
| 82 |
dtype="auto",
|
|
|
|
|
|
|
| 83 |
trust_remote_code=True,
|
| 84 |
)
|
| 85 |
self.processor = AutoProcessor.from_pretrained(
|
|
|
|
| 59 |
pass
|
| 60 |
|
| 61 |
# Auto-detect tensor parallel size from visible devices
|
| 62 |
+
# Some server environments require 'fork' for worker processes
|
| 63 |
+
os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "fork")
|
| 64 |
visible = os.environ.get("CUDA_VISIBLE_DEVICES")
|
| 65 |
if visible and visible.strip():
|
| 66 |
try:
|
|
|
|
| 82 |
pipeline_parallel_size=1,
|
| 83 |
gpu_memory_utilization=0.95,
|
| 84 |
dtype="auto",
|
| 85 |
+
distributed_executor_backend="mp",
|
| 86 |
+
enforce_eager=True,
|
| 87 |
trust_remote_code=True,
|
| 88 |
)
|
| 89 |
self.processor = AutoProcessor.from_pretrained(
|