| """ |
| Usage: |
| python offline_batch_inference_async.py --model-path Qwen/Qwen2-VL-7B-Instruct |
| |
| Note: |
| This demo shows the usage of async generation, |
| which is useful to implement an online-like generation with batched inference. |
| """ |
|
|
| import argparse |
| import asyncio |
| import dataclasses |
| import time |
|
|
| import sglang as sgl |
| from sglang.srt.server_args import ServerArgs |
|
|
|
|
| class InferenceEngine: |
| def __init__(self, **kwargs): |
| self.engine = sgl.Engine(**kwargs) |
|
|
| async def generate(self, prompt, sampling_params): |
| result = await self.engine.async_generate(prompt, sampling_params) |
| return result |
|
|
|
|
| async def run_server(server_args): |
| inference = InferenceEngine(**dataclasses.asdict(server_args)) |
|
|
| |
| prompts = [ |
| "Hello, my name is", |
| "The president of the United States is", |
| "The capital of France is", |
| "The future of AI is", |
| ] * 100 |
|
|
| |
| sampling_params = {"temperature": 0.8, "top_p": 0.95} |
|
|
| |
| tasks = [] |
| for prompt in prompts: |
| task = asyncio.create_task(inference.generate(prompt, sampling_params)) |
| tasks.append(task) |
|
|
| |
| for task in tasks: |
| await task |
| while True: |
| if not task.done(): |
| time.sleep(1) |
| else: |
| result = task.result() |
| print(f"Generated text: {result['text']}") |
| break |
|
|
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser() |
| ServerArgs.add_cli_args(parser) |
| args = parser.parse_args() |
| server_args = ServerArgs.from_cli_args(args) |
| asyncio.run(run_server(server_args)) |
|
|