shinnosukeono Claude Opus 4.5 committed on
Commit
4161fbd
·
1 Parent(s): 0ab0126

Fix: Use single vLLM engine with continuous batching

Browse files

- Detect available GPUs dynamically
- Use vLLM's built-in continuous batching for concurrent requests
- Remove multiprocessing approach that failed on HF Spaces
- Use tensor_parallel_size for multi-GPU when available

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +160 -191
app.py CHANGED
@@ -3,13 +3,31 @@ import os
3
  import re
4
  import uuid
5
  import threading
6
- import queue
7
- from multiprocessing import Process, Queue
8
  from typing import Generator
9
 
10
  import gradio as gr
11
 
12
- NUM_GPUS = 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Stop strings for generation
15
  STOP_STRINGS = [
@@ -23,134 +41,30 @@ STOP_RE = re.compile(
23
  re.MULTILINE
24
  )
25
 
26
-
27
- def gpu_worker_main(gpu_id: int, request_queue: Queue, response_queue: Queue):
28
- """
29
- Worker process that runs on a dedicated GPU.
30
- Each worker has its own vLLM engine instance.
31
- """
32
- os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
33
-
34
- import asyncio
35
- from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
36
-
37
- # Initialize vLLM engine on this GPU
38
- engine_args = AsyncEngineArgs(
39
- model="EQUES/JPharmatron-7B-chat",
40
- enforce_eager=True,
41
- gpu_memory_utilization=0.85,
42
- )
43
- engine = AsyncLLMEngine.from_engine_args(engine_args)
44
-
45
- loop = asyncio.new_event_loop()
46
- asyncio.set_event_loop(loop)
47
-
48
- async def generate_and_stream(request_id: str, prompt: str):
49
- """Generate tokens and stream chunks back via response queue."""
50
- params = SamplingParams(
51
- temperature=0.0,
52
- max_tokens=4096,
53
- repetition_penalty=1.2,
54
- stop=STOP_STRINGS,
55
- )
56
-
57
- previous_text = ""
58
- try:
59
- async for out in engine.generate(prompt, params, request_id=request_id):
60
- full_text = out.outputs[0].text
61
-
62
- # Check for stop patterns that might have leaked through
63
- m = STOP_RE.search(full_text)
64
- if m:
65
- cut = m.start()
66
- chunk = full_text[len(previous_text):cut]
67
- if chunk:
68
- response_queue.put((gpu_id, chunk, False))
69
- break
70
-
71
- chunk = full_text[len(previous_text):]
72
- previous_text = full_text
73
- if chunk:
74
- response_queue.put((gpu_id, chunk, False))
75
-
76
- except Exception as e:
77
- response_queue.put((gpu_id, f"\n[Error: {str(e)}]", False))
78
-
79
- # Signal completion
80
- response_queue.put((gpu_id, "", True))
81
-
82
- # Main worker loop
83
- while True:
84
- try:
85
- request = request_queue.get(timeout=1.0)
86
- except:
87
- continue
88
-
89
- if request is None: # Shutdown signal
90
- break
91
-
92
- request_id, prompt = request
93
- loop.run_until_complete(generate_and_stream(request_id, prompt))
94
-
95
-
96
- class ParallelInferenceManager:
97
- """Manages multiple GPU worker processes for parallel inference."""
98
-
99
- def __init__(self, num_gpus: int = NUM_GPUS):
100
- self.num_gpus = num_gpus
101
- self.workers = []
102
- self.request_queues = []
103
- self.response_queue = Queue() # Shared response queue
104
- self._started = False
105
-
106
- def start(self):
107
- """Start all GPU worker processes."""
108
- if self._started:
109
- return
110
-
111
- for gpu_id in range(self.num_gpus):
112
- request_queue = Queue()
113
- self.request_queues.append(request_queue)
114
-
115
- process = Process(
116
- target=gpu_worker_main,
117
- args=(gpu_id, request_queue, self.response_queue),
118
- daemon=True
119
- )
120
- process.start()
121
- self.workers.append(process)
122
-
123
- self._started = True
124
-
125
- def submit_request(self, gpu_id: int, prompt: str, request_id: str):
126
- """Submit a request to a specific GPU worker."""
127
- if 0 <= gpu_id < self.num_gpus:
128
- self.request_queues[gpu_id].put((request_id, prompt))
129
-
130
- def shutdown(self):
131
- """Shutdown all workers."""
132
- for q in self.request_queues:
133
- q.put(None)
134
- for w in self.workers:
135
- w.join(timeout=5)
136
- if w.is_alive():
137
- w.terminate()
138
-
139
-
140
- # Global manager instance (initialized lazily)
141
- _manager = None
142
- _manager_lock = threading.Lock()
143
-
144
-
145
- def get_manager() -> ParallelInferenceManager:
146
- """Get or create the global inference manager."""
147
- global _manager
148
- if _manager is None:
149
- with _manager_lock:
150
- if _manager is None:
151
- _manager = ParallelInferenceManager(NUM_GPUS)
152
- _manager.start()
153
- return _manager
154
 
155
 
156
  def build_prompt(user_input: str, mode: list[str]) -> str:
@@ -168,90 +82,145 @@ def build_prompt(user_input: str, mode: list[str]) -> str:
168
  return base_prompt
169
 
170
 
171
- def respond_parallel(
172
- prompt0: str, prompt1: str, prompt2: str, prompt3: str,
173
- prompt4: str, prompt5: str, prompt6: str, prompt7: str,
174
- mode: list[str]
175
- ) -> Generator:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  """
177
- Process up to 8 prompts in parallel, streaming results back.
178
- Each prompt is sent to a dedicated GPU worker.
179
  """
180
- prompts = [prompt0, prompt1, prompt2, prompt3, prompt4, prompt5, prompt6, prompt7]
181
- manager = get_manager()
182
 
183
- # Track active requests and their results
184
- results = [""] * NUM_GPUS
185
- active_gpus = set()
186
 
187
- # Submit non-empty prompts to their respective GPUs
188
- for gpu_id, prompt in enumerate(prompts):
189
  if prompt and prompt.strip():
190
  full_prompt = build_prompt(prompt.strip(), mode)
191
- request_id = f"req_{gpu_id}_{uuid.uuid4().hex[:8]}"
192
- manager.submit_request(gpu_id, full_prompt, request_id)
193
- active_gpus.add(gpu_id)
194
- results[gpu_id] = "" # Initialize result
195
- else:
196
- results[gpu_id] = "" # Empty prompt = empty result
197
-
198
- if not active_gpus:
199
- # No prompts to process
200
- yield tuple(results)
201
  return
202
 
203
- # Stream results from all active workers
204
- while active_gpus:
205
- try:
206
- gpu_id, chunk, is_done = manager.response_queue.get(timeout=0.1)
 
207
 
208
- if is_done:
209
- active_gpus.discard(gpu_id)
210
- else:
211
- results[gpu_id] += chunk
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
- # Yield current state of all results
214
- yield tuple(results)
215
 
216
- except:
217
- # Timeout - yield current state and continue
 
 
 
 
 
 
 
 
 
 
 
 
218
  yield tuple(results)
219
 
 
 
 
 
 
 
 
 
220
 
221
- def respond_single(gpu_id: int, prompt: str, mode: list[str]) -> Generator:
222
- """Process a single prompt on a specific GPU."""
223
- manager = get_manager()
224
 
 
 
225
  if not prompt or not prompt.strip():
226
  yield ""
227
  return
228
 
 
229
  full_prompt = build_prompt(prompt.strip(), mode)
230
- request_id = f"single_{gpu_id}_{uuid.uuid4().hex[:8]}"
231
- manager.submit_request(gpu_id, full_prompt, request_id)
232
-
233
- result = ""
234
- while True:
235
- try:
236
- recv_gpu_id, chunk, is_done = manager.response_queue.get(timeout=0.1)
237
-
238
- # Only process responses for our request
239
- if recv_gpu_id == gpu_id:
240
- if is_done:
241
- break
242
- result += chunk
243
- yield result
244
-
245
- except:
246
  yield result
247
 
 
 
 
 
 
 
 
 
248
 
249
  # Build the Gradio interface
250
  with gr.Blocks(title="JPharmatron Parallel Chat") as demo:
251
  gr.Markdown("# 💊 JPharmatron - Parallel Request Processing")
252
  gr.Markdown(
253
- "Enter up to 8 prompts and process them simultaneously on dedicated GPUs. "
254
- "Each response streams independently."
255
  )
256
 
257
  # Mode selection
@@ -355,10 +324,10 @@ with gr.Blocks(title="JPharmatron Parallel Chat") as demo:
355
  with gr.Row():
356
  for i in range(8):
357
  btn = gr.Button(f"Run #{i+1}", size="sm")
358
- # Create closure to capture gpu_id
359
- def make_single_handler(gpu_id):
360
  def handler(prompt, mode):
361
- yield from respond_single(gpu_id, prompt, mode)
362
  return handler
363
  btn.click(
364
  fn=make_single_handler(i),
 
3
  import re
4
  import uuid
5
  import threading
 
 
6
  from typing import Generator
7
 
8
  import gradio as gr
9
 
10
# GPU detection: prefer the torch runtime's count, then fall back to
# inspecting CUDA_VISIBLE_DEVICES, and finally assume a single GPU.
def get_num_gpus() -> int:
    """Detect the number of available GPUs."""
    try:
        import torch
    except ImportError:
        torch = None

    if torch is not None and torch.cuda.is_available():
        return torch.cuda.device_count()

    # No usable CUDA runtime via torch: infer the count from the
    # CUDA_VISIBLE_DEVICES environment variable when it is set.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if visible:
        return len(visible.split(","))

    return 1  # Default to 1

NUM_GPUS = get_num_gpus()
MAX_PARALLEL_REQUESTS = 8  # UI supports up to 8 parallel inputs

print(f"Detected {NUM_GPUS} GPU(s)")
31
 
32
  # Stop strings for generation
33
  STOP_STRINGS = [
 
41
  re.MULTILINE
42
  )
43
 
44
# Global vLLM engine (single instance, handles concurrent requests internally)
_engine = None
_engine_lock = threading.Lock()
_loop = None


def get_engine():
    """Get or create the global vLLM engine and its event loop.

    Returns:
        tuple: ``(AsyncLLMEngine, event_loop)`` shared by all request
        handlers. Built lazily, exactly once, via double-checked locking.
    """
    global _engine, _loop
    if _engine is None:
        with _engine_lock:
            if _engine is None:
                # Imported lazily: `asyncio` is not in the module's import
                # block, and vllm is only needed once serving starts.
                import asyncio
                from vllm import AsyncLLMEngine, AsyncEngineArgs

                engine_args = AsyncEngineArgs(
                    model="EQUES/JPharmatron-7B-chat",
                    enforce_eager=True,
                    gpu_memory_utilization=0.85,
                    tensor_parallel_size=NUM_GPUS,  # Use all available GPUs
                )
                engine = AsyncLLMEngine.from_engine_args(engine_args)

                # Dedicated loop used by the synchronous Gradio handlers to
                # drive async generation.
                # NOTE(review): run_until_complete on this shared loop is not
                # safe from multiple threads — confirm Gradio serializes these
                # handlers, or move the loop to a background thread and use
                # run_coroutine_threadsafe.
                _loop = asyncio.new_event_loop()
                asyncio.set_event_loop(_loop)

                # Publish the engine LAST: a racing reader that passes the
                # unlocked `_engine is None` check must never observe a
                # non-None engine paired with a still-None loop.
                _engine = engine
    return _engine, _loop
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
 
70
  def build_prompt(user_input: str, mode: list[str]) -> str:
 
82
  return base_prompt
83
 
84
 
85
async def astream_generate(engine, prompt: str, request_id: str):
    """Stream newly generated text from vLLM as incremental string chunks."""
    from vllm import SamplingParams

    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=4096,
        repetition_penalty=1.2,
        stop=STOP_STRINGS,
    )

    emitted = ""
    async for output in engine.generate(prompt, sampling, request_id=request_id):
        text = output.outputs[0].text

        # A stop pattern leaked past vLLM's own stop handling: yield only the
        # portion up to the match, then end the stream.
        match = STOP_RE.search(text)
        if match is not None:
            tail = text[len(emitted):match.start()]
            if tail:
                yield tail
            break

        delta = text[len(emitted):]
        emitted = text
        if delta:
            yield delta
113
+
114
+
115
async def run_parallel_async(prompts: list[str], mode: list[str]):
    """
    Run multiple prompts in parallel using vLLM's continuous batching.

    Yields the list of per-slot accumulated texts after every update so the
    caller can stream all outputs together. Empty prompts leave their slot
    as an empty string.
    """
    import asyncio  # not in the module's import block; keep this local

    engine, _ = get_engine()
    results = [""] * MAX_PARALLEL_REQUESTS

    async def consume(slot_id: int, full_prompt: str, request_id: str) -> None:
        # Drain one stream into its slot; all consumers run concurrently so
        # vLLM can batch the requests internally.
        async for chunk in astream_generate(engine, full_prompt, request_id):
            results[slot_id] += chunk

    # One consumer task per non-empty prompt.
    #
    # FIX: the previous version polled each generator with
    # asyncio.wait_for(gen.__anext__(), timeout=0.05). On timeout, wait_for
    # CANCELS __anext__, which injects CancelledError into the async
    # generator and aborts the in-flight vLLM request, losing its output.
    # Driving each stream in its own task avoids cancellation entirely.
    tasks = []
    for i, prompt in enumerate(prompts):
        if prompt and prompt.strip():
            full_prompt = build_prompt(prompt.strip(), mode)
            request_id = f"req_{i}_{uuid.uuid4().hex[:8]}"
            tasks.append(asyncio.ensure_future(consume(i, full_prompt, request_id)))

    if not tasks:
        yield results
        return

    # Snapshot loop: periodically yield the current accumulated texts until
    # every consumer has finished.
    while not all(task.done() for task in tasks):
        await asyncio.sleep(0.05)
        yield results

    # Surface the first generation failure (if any) after all streams end.
    for task in tasks:
        exc = task.exception()
        if exc is not None:
            raise exc

    yield results
 
166
 
167
+
168
def respond_parallel(
    prompt0: str, prompt1: str, prompt2: str, prompt3: str,
    prompt4: str, prompt5: str, prompt6: str, prompt7: str,
    mode: list[str]
) -> Generator:
    """
    Process up to 8 prompts in parallel using vLLM's continuous batching.

    Bridges the async snapshot stream onto the shared event loop and yields
    one tuple of per-slot texts per update.
    """
    slot_prompts = [
        prompt0, prompt1, prompt2, prompt3,
        prompt4, prompt5, prompt6, prompt7,
    ]
    _, loop = get_engine()

    async def updates():
        async for snapshot in run_parallel_async(slot_prompts, mode):
            yield tuple(snapshot)

    # Step the async generator to completion, forwarding each snapshot.
    stream = updates()
    while True:
        try:
            yield loop.run_until_complete(stream.__anext__())
        except StopAsyncIteration:
            return
191
 
 
 
 
192
 
193
def respond_single(slot_id: int, prompt: str, mode: list[str]) -> Generator:
    """Process a single prompt, streaming the accumulated response text."""
    if not prompt or not prompt.strip():
        yield ""
        return

    engine, loop = get_engine()
    full_prompt = build_prompt(prompt.strip(), mode)
    request_id = f"single_{slot_id}_{uuid.uuid4().hex[:8]}"

    async def updates():
        accumulated = ""
        async for chunk in astream_generate(engine, full_prompt, request_id):
            accumulated += chunk
            yield accumulated

    # Drive the async stream on the shared loop, yielding each growth step.
    stream = updates()
    while True:
        try:
            yield loop.run_until_complete(stream.__anext__())
        except StopAsyncIteration:
            return
216
+
217
 
218
  # Build the Gradio interface
219
  with gr.Blocks(title="JPharmatron Parallel Chat") as demo:
220
  gr.Markdown("# 💊 JPharmatron - Parallel Request Processing")
221
  gr.Markdown(
222
+ f"Enter up to {MAX_PARALLEL_REQUESTS} prompts and process them simultaneously. "
223
+ f"Using {NUM_GPUS} GPU(s) with vLLM continuous batching."
224
  )
225
 
226
  # Mode selection
 
324
  with gr.Row():
325
  for i in range(8):
326
  btn = gr.Button(f"Run #{i+1}", size="sm")
327
+ # Create closure to capture slot_id
328
+ def make_single_handler(slot_id):
329
  def handler(prompt, mode):
330
+ yield from respond_single(slot_id, prompt, mode)
331
  return handler
332
  btn.click(
333
  fn=make_single_handler(i),