| import asyncio | |
| import os | |
| from openai import AsyncOpenAI | |
| from sglang.utils import launch_server_cmd, terminate_process, wait_for_server | |
| from tap import Tap | |
| from tqdm.auto import tqdm | |
| from collections.abc import Coroutine, Sequence | |
def limit_concurrency(
    coroutines: Sequence[Coroutine], concurrency: int
) -> list[Coroutine]:
    """Wrap coroutines so that at most ``concurrency`` of them run at once.

    Every returned coroutine acquires a shared semaphore before awaiting the
    coroutine it wraps, and releases it when that coroutine finishes or
    raises.

    Args:
        coroutines: Coroutine objects to wrap. They are not started here.
        concurrency: Maximum number of wrapped coroutines allowed to hold the
            semaphore simultaneously. Must be at least 1.

    Returns:
        A list of new coroutines, in the same order as ``coroutines``; each
        one resolves to the result of the coroutine it wraps.

    Raises:
        ValueError: If ``concurrency`` is smaller than 1.
    """
    # A semaphore created with 0 permits can never be acquired, so every
    # wrapped coroutine would wait forever; fail loudly instead.
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    semaphore = asyncio.Semaphore(concurrency)

    async def with_concurrency_limit(coroutine: Coroutine) -> object:
        # Holding the semaphore for the whole await is what bounds the number
        # of coroutines in flight.
        async with semaphore:
            return await coroutine

    return [with_concurrency_limit(coroutine) for coroutine in coroutines]
class Argument(Tap):
    """Command-line options for this script."""

    # Used both as the server's --model-path and as the `model` field of
    # every chat-completion request.
    model_name_or_path: str = "/mnt/jfs/ckpt/checkpoints/Qwen2.5-32B-Instruct"

    # Sampling temperature sent with every request.
    temperature: float = 0.2

    # Sent as `max_tokens` with every request.
    max_completion_tokens: int = 10000

    # Upper bound on the number of requests awaited at the same time.
    concurrency: int = 100
async def main(args: Argument, base_url: str, api_key: str = "sglang") -> None:
    """Send every prompt of the demo dataset to the server and print replies.

    Args:
        args: Parsed options; supplies the model name, the sampling
            temperature, the token limit and the concurrency cap.
        base_url: Base URL of the OpenAI-compatible endpoint.
        api_key: API key handed to the client.

    Replies are printed as requests complete, so the output order need not
    match the order of the prompts.
    """
    dataset = [
        "1+1=?",
        "Where is the capital of France?",
    ]
    # Use the client as an async context manager so its underlying HTTP
    # connections are closed on exit, including when a request raises.
    async with AsyncOpenAI(base_url=base_url, api_key=api_key) as client:
        requests = [
            client.chat.completions.create(
                model=args.model_name_or_path,
                messages=[{"role": "user", "content": prompt}],
                temperature=args.temperature,
                max_tokens=args.max_completion_tokens,
            )
            for prompt in dataset
        ]
        # Cap how many requests are awaited at the same time.
        limited = limit_concurrency(requests, args.concurrency)
        async for finished in tqdm(
            asyncio.as_completed(limited), total=len(limited), desc="Running"
        ):
            # Awaiting the yielded item gives the response (or re-raises the
            # request's exception).
            response = await finished
            result = response.choices[0].message.content
            # print result, or save to file
            print(result)
if __name__ == "__main__":
    # Script entry point: launch a local SGLang server, run the demo prompts
    # against it, and always shut the server down afterwards.
    args = Argument().parse_args()
    os.environ["SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"] = "1"
    # NOTE(review): the client below talks to the port *returned* by
    # launch_server_cmd, not necessarily 33333 — the explicit "--port" flag
    # may be redundant or overridden; confirm against the installed sglang.
    server_process, port = launch_server_cmd(
        (
            "python3 -m sglang.launch_server "
            "--tp 8 "
            "--dp 1 "
            f"--model-path {args.model_name_or_path} "
            # f"--served-model-name {args.model} "
            # "--reasoning-parser qwen3 "
            "--context-length 16000 "
            # """--json-model-override-args {"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}} """
            "--host 0.0.0.0 "
            "--port 33333 "
            "--log-level warning "
        )
    )
    try:
        wait_for_server(f"http://localhost:{port}")
        asyncio.run(
            main(args, base_url=f"http://localhost:{port}/v1", api_key="sglang")
        )
    finally:
        # Terminate the server even if startup or a request failed, so the
        # launched process is not left running.
        terminate_process(server_process)
Xet Storage Details
- Size:
- 2.5 kB
- Xet hash:
- fb937afe77144b3ca5cc699e69eca2a191699b3c067eb54e879d08c5c156e11a
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.