| """ |
| Usage: |
| python3 offline_batch_inference.py |
| """ |
|
|
| from urllib.request import urlopen |
|
|
| import sglang as sgl |
|
|
|
|
| def load_prompt() -> str: |
| |
| |
| |
| |
| |
| |
|
|
| with urlopen( |
| "https://qianwen-res.oss-cn-beijing.aliyuncs.com" |
| "/Qwen2.5-1M/test-data/64k.txt", |
| timeout=5, |
| ) as response: |
| prompt = response.read().decode("utf-8") |
| return prompt |
|
|
|
|
| |
| def process_requests(llm: sgl.Engine, prompts: list[str]) -> None: |
| |
| sampling_params = { |
| "temperature": 0.7, |
| "top_p": 0.8, |
| "top_k": 20, |
| "repetition_penalty": 1.05, |
| "max_new_tokens": 256, |
| } |
| |
| outputs = llm.generate(prompts, sampling_params) |
| |
| for output in outputs: |
| prompt_token_ids = output["meta_info"]["prompt_tokens"] |
| generated_text = output["text"] |
| print( |
| f"Prompt length: {prompt_token_ids}, " f"Generated text: {generated_text!r}" |
| ) |
|
|
|
|
| |
| def initialize_engine() -> sgl.Engine: |
| llm = sgl.Engine( |
| model_path="Qwen/Qwen2.5-7B-Instruct-1M", |
| context_length=1048576, |
| page_size=256, |
| attention_backend="dual_chunk_flash_attn", |
| tp_size=4, |
| disable_radix_cache=True, |
| enable_mixed_chunk=False, |
| enable_torch_compile=False, |
| chunked_prefill_size=131072, |
| mem_fraction_static=0.6, |
| log_level="DEBUG", |
| ) |
| return llm |
|
|
|
|
| def main(): |
| llm = initialize_engine() |
| prompt = load_prompt() |
| process_requests(llm, [prompt]) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|