shinnosukeono Claude Opus 4.5 committed on
Commit
86da8ad
·
0 Parent(s):

Initial commit: Parallel request processing for JPharmatron

Browse files

- 8 parallel input/output boxes with streaming
- Multi-GPU architecture (one vLLM engine per GPU)
- Multiprocessing workers for true parallelism
- Individual and batch execution modes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (3) hide show
  1. README.md +41 -0
  2. app.py +377 -0
  3. requirements.txt +5 -0
README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: JPharmatron Parallel Chat
3
+ emoji: 💊
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.45.0
8
+ app_file: app.py
9
+ pinned: false
10
+ hardware: nvidia-l40s-x8
11
+ ---
12
+
13
+ # JPharmatron Parallel Chat
14
+
15
+ Parallel request processing interface for JPharmatron-7B-chat model.
16
+
17
+ ## Features
18
+
19
+ - **8 Parallel Request Processing**: Submit up to 8 prompts simultaneously
20
+ - **Independent Streaming Outputs**: Each response streams independently
21
+ - **Multi-GPU Architecture**: One vLLM engine instance per L40S GPU
22
+ - **True Parallelism**: No contention between requests
23
+
24
+ ## Hardware Requirements
25
+
26
+ This Space requires **8x NVIDIA L40S** GPUs (48GB VRAM each).
27
+
28
+ - Each 7B model instance uses ~14GB VRAM in fp16
29
+ - 8 independent instances = 8x true throughput
30
+ - No inter-GPU communication overhead
31
+
32
+ ## Usage
33
+
34
+ 1. Enter prompts in any of the 8 input text boxes
35
+ 2. Select mode options (pharmaceutical expert, international standards, specific procedures)
36
+ 3. Click "Run All in Parallel" to execute all prompts simultaneously
37
+ 4. Watch responses stream in real-time in their corresponding output boxes
38
+
39
+ ## Model
40
+
41
+ Uses [EQUES/JPharmatron-7B-chat](https://huggingface.co/EQUES/JPharmatron-7B-chat) - a pharmaceutical domain expert model.
app.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import re
4
+ import uuid
5
+ import threading
6
+ import queue
7
+ from multiprocessing import Process, Queue
8
+ from typing import Generator
9
+
10
+ import gradio as gr
11
+
12
# Number of GPU worker processes (one vLLM engine instance per GPU).
NUM_GPUS = 8

# Stop strings for generation: role prefixes with ASCII (:) and full-width
# (:) colon variants so the model does not continue the dialogue by itself.
STOP_STRINGS = [
    "\nUser:", "\nユーザ:", "\nユーザー:",
    "\nAssistant:", "\nアシスタント:",
    "\nHuman:", "\nHuman:"
]
# Regex for post-processing cleanup: removes any role-prefixed turn that
# leaked past the stop strings (e.g. when a stop string is split across
# tokens). "Human" is included to stay consistent with STOP_STRINGS.
STOP_RE = re.compile(
    r"(?:^|\n)(?:User|ユーザ|ユーザー|Assistant|アシスタント|Human)[::].*",
    re.MULTILINE
)
25
+
26
+
27
def gpu_worker_main(gpu_id: int, request_queue: Queue, response_queue: Queue):
    """
    Worker-process entry point pinned to a single GPU.

    Each worker owns a private vLLM engine. Requests arrive on
    ``request_queue`` as ``(request_id, prompt)`` tuples (``None`` is the
    shutdown sentinel); streamed chunks are pushed onto the shared
    ``response_queue`` as ``(gpu_id, chunk, is_done)`` tuples.

    Args:
        gpu_id: Index of the GPU this worker is bound to.
        request_queue: Per-worker queue of incoming generation requests.
        response_queue: Shared queue for streaming chunks back to the UI.
    """
    # Must be set before CUDA is initialized so this process only ever
    # sees its assigned device.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    # Deferred import: vLLM initializes CUDA on import, so it must happen
    # after the device mask above is in place, inside the child process.
    from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams

    # Initialize the vLLM engine on this GPU.
    engine_args = AsyncEngineArgs(
        model="EQUES/JPharmatron-7B-chat",
        enforce_eager=True,
        gpu_memory_utilization=0.85,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # Dedicated event loop: the async engine is driven synchronously from
    # this worker's request loop via run_until_complete().
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async def generate_and_stream(request_id: str, prompt: str):
        """Generate tokens and stream incremental chunks via response_queue."""
        params = SamplingParams(
            temperature=0.0,  # greedy decoding for reproducible answers
            max_tokens=4096,
            repetition_penalty=1.2,
            stop=STOP_STRINGS,
        )

        previous_text = ""
        try:
            async for out in engine.generate(prompt, params, request_id=request_id):
                full_text = out.outputs[0].text

                # Stop patterns occasionally leak past vLLM's stop strings
                # (e.g. split across tokens); cut the output there.
                m = STOP_RE.search(full_text)
                if m:
                    cut = m.start()
                    # A negative-width slice yields "" and is skipped below.
                    chunk = full_text[len(previous_text):cut]
                    if chunk:
                        response_queue.put((gpu_id, chunk, False))
                    break

                chunk = full_text[len(previous_text):]
                previous_text = full_text
                if chunk:
                    response_queue.put((gpu_id, chunk, False))

        except Exception as e:
            # Surface the failure in the output box instead of dying silently.
            response_queue.put((gpu_id, f"\n[Error: {str(e)}]", False))

        # Completion sentinel for this request.
        response_queue.put((gpu_id, "", True))

    # Main worker loop: poll with a short timeout so the shutdown sentinel
    # is noticed promptly even when no requests are pending.
    while True:
        try:
            request = request_queue.get(timeout=1.0)
        except queue.Empty:
            continue

        if request is None:  # Shutdown signal
            break

        request_id, prompt = request
        loop.run_until_complete(generate_and_stream(request_id, prompt))
94
+
95
+
96
class ParallelInferenceManager:
    """Owns one worker process per GPU and routes prompts to them.

    Workers each get a private request queue; all of them stream chunks
    back onto a single shared response queue tagged with their gpu_id.
    """

    def __init__(self, num_gpus: int = NUM_GPUS):
        self.num_gpus = num_gpus
        self.workers = []
        self.request_queues = []
        self.response_queue = Queue()  # shared by every worker
        self._started = False

    def start(self):
        """Spawn one daemon worker process per GPU (idempotent)."""
        if self._started:
            return

        for gpu_id in range(self.num_gpus):
            inbox = Queue()
            self.request_queues.append(inbox)

            worker = Process(
                target=gpu_worker_main,
                args=(gpu_id, inbox, self.response_queue),
                daemon=True
            )
            worker.start()
            self.workers.append(worker)

        self._started = True

    def submit_request(self, gpu_id: int, prompt: str, request_id: str):
        """Queue a prompt on the worker pinned to gpu_id (no-op if out of range)."""
        if 0 <= gpu_id < self.num_gpus:
            self.request_queues[gpu_id].put((request_id, prompt))

    def shutdown(self):
        """Ask every worker to exit, then force-terminate stragglers."""
        for inbox in self.request_queues:
            inbox.put(None)  # shutdown sentinel understood by the worker loop
        for worker in self.workers:
            worker.join(timeout=5)
            if worker.is_alive():
                worker.terminate()
138
+
139
+
140
# Global manager instance (initialized lazily); see get_manager() for the
# lock-guarded construction.
_manager = None
# Guards first-time construction of _manager across Gradio handler threads.
_manager_lock = threading.Lock()
143
+
144
+
145
def get_manager() -> ParallelInferenceManager:
    """Return the process-wide inference manager, creating it on first use.

    Uses double-checked locking: the unlocked fast path avoids contention
    once the manager exists, while the locked re-check prevents two threads
    from both constructing it. The global is assigned only AFTER start()
    completes, so no thread can ever observe a manager whose worker
    processes have not been spawned yet.
    """
    global _manager
    if _manager is None:
        with _manager_lock:
            if _manager is None:
                manager = ParallelInferenceManager(NUM_GPUS)
                manager.start()
                _manager = manager  # publish fully-started instance only
    return _manager
154
+
155
+
156
def build_prompt(user_input: str, mode: list[str]) -> str:
    """Build the full prompt: system instruction, mode add-ons, user turn.

    Args:
        user_input: The user's question (callers pass it already stripped).
        mode: Selected checkbox labels; each recognized label appends one
            extra instruction line to the system prompt.

    Returns:
        The prompt string, ending with "アシスタント:" so the model
        continues in the assistant role.
    """
    # Fixed the mojibake in the original literal ("専門家���す" -> "専門家です"),
    # matching the intact copy of the same sentence used below.
    base_prompt = "あなたは製薬に関する専門家です。製薬に関するユーザーの質問に親切に回答してください。参照した文献を回答の末尾に常に提示してください。\n"

    if "製薬の専門家" in mode:
        # NOTE: largely repeats the base instruction; kept to preserve the
        # original behavior of this checkbox.
        base_prompt += "あなたは製薬に関する専門家です。製薬に関するユーザーの質問に親切に回答してください。参照した文献は常に提示してください。\n"
    if "国際基準に準拠" in mode:
        base_prompt += "回答に際して、国際基準に準拠してください。\n"
    if "具体的な手順" in mode:
        base_prompt += "回答には具体的な作業手順を含めてください。\n"

    base_prompt += f"ユーザー: {user_input}\nアシスタント:"
    return base_prompt
169
+
170
+
171
def respond_parallel(
    prompt0: str, prompt1: str, prompt2: str, prompt3: str,
    prompt4: str, prompt5: str, prompt6: str, prompt7: str,
    mode: list[str]
) -> Generator:
    """
    Process up to 8 prompts in parallel, streaming results back.

    Each non-empty prompt is dispatched to the GPU worker with the same
    index. Chunks from the shared response queue are merged into per-GPU
    buffers, and the full 8-tuple of partial results is yielded after
    every update so Gradio refreshes all output boxes.

    Args:
        prompt0..prompt7: User prompts; empty/blank slots are skipped.
        mode: Checkbox labels forwarded to build_prompt().

    Yields:
        An 8-tuple of accumulated response strings, one per output box.
    """
    prompts = [prompt0, prompt1, prompt2, prompt3, prompt4, prompt5, prompt6, prompt7]
    manager = get_manager()

    # Per-GPU accumulated results; empty slots simply stay "".
    results = [""] * NUM_GPUS
    active_gpus = set()

    # Submit non-empty prompts to their respective GPUs.
    for gpu_id, prompt in enumerate(prompts):
        if prompt and prompt.strip():
            full_prompt = build_prompt(prompt.strip(), mode)
            request_id = f"req_{gpu_id}_{uuid.uuid4().hex[:8]}"
            manager.submit_request(gpu_id, full_prompt, request_id)
            active_gpus.add(gpu_id)

    if not active_gpus:
        # No prompts to process.
        yield tuple(results)
        return

    # Drain the shared queue until every active worker has sent its
    # (gpu_id, "", True) completion sentinel.
    while active_gpus:
        try:
            gpu_id, chunk, is_done = manager.response_queue.get(timeout=0.1)

            if is_done:
                active_gpus.discard(gpu_id)
            else:
                results[gpu_id] += chunk

            # Yield current state of all results.
            yield tuple(results)

        except queue.Empty:
            # No chunk within the poll window — re-yield so the UI stays live.
            yield tuple(results)
219
+
220
+
221
def respond_single(gpu_id: int, prompt: str, mode: list[str]) -> Generator:
    """Process a single prompt on a specific GPU, yielding the growing response.

    NOTE(review): the response queue is shared by all workers, and chunks
    that arrive here for OTHER GPUs are consumed and dropped. That is only
    safe while individual runs are not interleaved with parallel runs —
    confirm, or route responses per-request, if usage changes.

    Args:
        gpu_id: Index of the worker/output slot to use.
        prompt: The user prompt; blank input yields "" and returns.
        mode: Checkbox labels forwarded to build_prompt().

    Yields:
        The accumulated response string after each received chunk.
    """
    manager = get_manager()

    if not prompt or not prompt.strip():
        yield ""
        return

    full_prompt = build_prompt(prompt.strip(), mode)
    request_id = f"single_{gpu_id}_{uuid.uuid4().hex[:8]}"
    manager.submit_request(gpu_id, full_prompt, request_id)

    result = ""
    while True:
        try:
            recv_gpu_id, chunk, is_done = manager.response_queue.get(timeout=0.1)

            # Only process responses for our request.
            if recv_gpu_id == gpu_id:
                if is_done:
                    break
                result += chunk
                yield result

        except queue.Empty:
            # Nothing yet — re-yield the current partial result.
            yield result
247
+
248
+
249
# Build the Gradio interface. `demo` is the top-level Blocks app that
# main() queues and launches.
with gr.Blocks(title="JPharmatron Parallel Chat") as demo:
    gr.Markdown("# 💊 JPharmatron - Parallel Request Processing")
    gr.Markdown(
        "Enter up to 8 prompts and process them simultaneously on dedicated GPUs. "
        "Each response streams independently."
    )

    # Mode selection: labels are matched by exact string in build_prompt().
    mode = gr.CheckboxGroup(
        label="モード (Mode)",
        choices=["製薬の専門家", "国際基準に準拠", "具体的な手順"],
        value=[],
    )

    # Preset example prompts (Japanese pharmaceutical questions).
    gr.Markdown("### 🔧 Presets (click to copy)")
    preset_list = [
        "グレープフルーツと薬を一緒に飲んじゃだめなんですか?",
        "新薬の臨床試験(Phase I〜III)の概要を、具体例つきで簡単に教えて。",
        "ジェネリック医薬品が承認されるまでの流れを、タイムラインで解説して。",
        "抗生物質の作用機序と耐性菌について説明してください。",
        "COVID-19ワクチンの開発プロセスを教えてください。",
        "薬物相互作用の主なメカニズムを教えてください。",
        "バイオシミラーと先発医薬品の違いは何ですか?",
        "製薬企業のGMP(Good Manufacturing Practice)について説明してください。",
    ]

    # Input section: 8 prompt boxes in two columns of 4; box index i maps
    # to GPU worker i.
    gr.Markdown("### 📝 Input Prompts")
    with gr.Row():
        with gr.Column():
            input_boxes = []
            for i in range(4):
                tb = gr.Textbox(
                    label=f"Prompt {i+1}",
                    placeholder=f"Enter prompt {i+1}...",
                    lines=3
                )
                input_boxes.append(tb)
        with gr.Column():
            for i in range(4, 8):
                tb = gr.Textbox(
                    label=f"Prompt {i+1}",
                    placeholder=f"Enter prompt {i+1}...",
                    lines=3
                )
                input_boxes.append(tb)

    # Examples that fill multiple boxes: each example row populates the
    # first 4 prompt boxes (row 1 = presets 1-4, row 2 = presets 5-8).
    gr.Examples(
        examples=[preset_list[:4], preset_list[4:]],
        inputs=input_boxes[:4],
        label="Fill first 4 prompts with presets"
    )

    # Control buttons.
    with gr.Row():
        run_all_btn = gr.Button("🚀 Run All in Parallel", variant="primary", scale=2)
        clear_inputs_btn = gr.Button("🗑️ Clear Inputs", scale=1)
        clear_outputs_btn = gr.Button("🗑️ Clear Outputs", scale=1)

    # Output section: 8 read-only response boxes mirroring the input layout.
    gr.Markdown("### 📤 Streaming Outputs")
    with gr.Row():
        with gr.Column():
            output_boxes = []
            for i in range(4):
                tb = gr.Textbox(
                    label=f"Response {i+1}",
                    lines=10,
                    interactive=False,
                    show_copy_button=True
                )
                output_boxes.append(tb)
        with gr.Column():
            for i in range(4, 8):
                tb = gr.Textbox(
                    label=f"Response {i+1}",
                    lines=10,
                    interactive=False,
                    show_copy_button=True
                )
                output_boxes.append(tb)

    # Wire up the "Run All" button: respond_parallel is a generator, so
    # Gradio streams each yielded 8-tuple into the output boxes.
    run_all_btn.click(
        fn=respond_parallel,
        inputs=input_boxes + [mode],
        outputs=output_boxes
    )

    # Clear buttons: reset all 8 inputs / outputs to empty strings.
    clear_inputs_btn.click(
        fn=lambda: tuple([""] * 8),
        inputs=None,
        outputs=input_boxes
    )
    clear_outputs_btn.click(
        fn=lambda: tuple([""] * 8),
        inputs=None,
        outputs=output_boxes
    )

    # Individual run buttons for each slot.
    gr.Markdown("### 🎯 Run Individual Prompts")
    with gr.Row():
        for i in range(8):
            btn = gr.Button(f"Run #{i+1}", size="sm")
            # Factory function binds the current loop value of i as
            # gpu_id, avoiding the classic late-binding closure bug.
            def make_single_handler(gpu_id):
                def handler(prompt, mode):
                    yield from respond_single(gpu_id, prompt, mode)
                return handler
            btn.click(
                fn=make_single_handler(i),
                inputs=[input_boxes[i], mode],
                outputs=[output_boxes[i]]
            )
368
+
369
+
370
def main():
    """Entry point for the application.

    queue() must be enabled before launch so the generator-based handlers
    can stream partial results to the browser.
    """
    demo.queue()
    demo.launch()


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers>=4.40.0
2
+ accelerate>=0.30.0
3
+ gradio>=5.45.0
4
+ vllm>=0.4.0
5
+ torch>=2.2.0