Forrest Wargo committed on
Commit
eccbc24
·
1 Parent(s): 222ced7

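fixing vllms: default VLLM_WORKER_MULTIPROC_METHOD to "fork", set distributed_executor_backend="mp", and set enforce_eager=True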

Browse files
Files changed (1) hide show
  1. handler.py +4 -0
handler.py CHANGED
@@ -59,6 +59,8 @@ class EndpointHandler:
59
  pass
60
 
61
  # Auto-detect tensor parallel size from visible devices
 
 
62
  visible = os.environ.get("CUDA_VISIBLE_DEVICES")
63
  if visible and visible.strip():
64
  try:
@@ -80,6 +82,8 @@ class EndpointHandler:
80
  pipeline_parallel_size=1,
81
  gpu_memory_utilization=0.95,
82
  dtype="auto",
 
 
83
  trust_remote_code=True,
84
  )
85
  self.processor = AutoProcessor.from_pretrained(
 
59
  pass
60
 
61
  # Auto-detect tensor parallel size from visible devices
62
+ # Some server environments require 'fork' for worker processes
63
+ os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "fork")
64
  visible = os.environ.get("CUDA_VISIBLE_DEVICES")
65
  if visible and visible.strip():
66
  try:
 
82
  pipeline_parallel_size=1,
83
  gpu_memory_utilization=0.95,
84
  dtype="auto",
85
+ distributed_executor_backend="mp",
86
+ enforce_eager=True,
87
  trust_remote_code=True,
88
  )
89
  self.processor = AutoProcessor.from_pretrained(