Hanrui / sglang /examples /runtime /engine /offline_batch_inference_async.py

Add files using upload-large-folder tool

61ba51e verified 25 days ago

1.74 kB

	"""
	Usage:
	python offline_batch_inference_async.py --model-path Qwen/Qwen2-VL-7B-Instruct

	Note:
	This demo shows how to use async generation,
	which is useful for implementing online-like generation with batched inference.
	"""

	import argparse
	import asyncio
	import dataclasses
	import time

	import sglang as sgl
	from sglang.srt.server_args import ServerArgs


	class InferenceEngine:
	    """Minimal async facade over an ``sgl.Engine`` instance."""

	    def __init__(self, **kwargs):
	        """Create the underlying engine, forwarding all keyword arguments to ``sgl.Engine``."""
	        self.engine = sgl.Engine(**kwargs)

	    async def generate(self, prompt, sampling_params):
	        """Await the engine's ``async_generate`` for one prompt and return its result unchanged."""
	        return await self.engine.async_generate(prompt, sampling_params)


	async def run_server(server_args):
	    """Generate completions for a fixed set of sample prompts concurrently.

	    Args:
	        server_args: A dataclass instance; its fields (via
	            ``dataclasses.asdict``) are passed as keyword arguments to
	            ``InferenceEngine``.

	    Every prompt is scheduled as its own asyncio task up front, then the
	    results are awaited and printed in submission order.
	    """
	    inference = InferenceEngine(**dataclasses.asdict(server_args))

	    # Sample prompts.
	    prompts = [
	        "Hello, my name is",
	        "The president of the United States is",
	        "The capital of France is",
	        "The future of AI is",
	    ] * 100

	    # Create a sampling params object.
	    sampling_params = {"temperature": 0.8, "top_p": 0.95}

	    # Schedule all generation tasks before awaiting any of them so that
	    # they run concurrently.
	    tasks = [
	        asyncio.create_task(inference.generate(prompt, sampling_params))
	        for prompt in prompts
	    ]

	    # Awaiting a task yields its result directly. No polling is needed, and
	    # a synchronous sleep here would stall the event loop and every other
	    # pending task.
	    for task in tasks:
	        result = await task
	        print(f"Generated text: {result['text']}")


	if __name__ == "__main__":
	    # Let ServerArgs register its command-line options on the parser, then
	    # rebuild a ServerArgs instance from the parsed values.
	    cli = argparse.ArgumentParser()
	    ServerArgs.add_cli_args(cli)
	    engine_args = ServerArgs.from_cli_args(cli.parse_args())

	    # Drive the async demo to completion on a fresh event loop.
	    asyncio.run(run_server(engine_args))