| from __future__ import annotations | |
| import logging | |
| from typing import TYPE_CHECKING, List | |
| import numpy as np | |
| import tqdm | |
| from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST | |
| from sglang.srt.managers.io_struct import GenerateReqInput | |
| if TYPE_CHECKING: | |
| from sglang.srt.managers.tokenizer_manager import TokenizerManager | |
| logger = logging.getLogger(__file__) | |
| _warmup_registry = {} | |
| def warmup(name: str): | |
| def decorator(fn): | |
| _warmup_registry[name] = fn | |
| return fn | |
| return decorator | |
| async def execute_warmups( | |
| disaggregation_mode: str, | |
| warmup_names: List[str], | |
| tokenizer_manager: TokenizerManager, | |
| ): | |
| for warmup_name in warmup_names: | |
| if warmup_name not in _warmup_registry: | |
| logger.warning(f"Could not find custom warmup {warmup_name}") | |
| continue | |
| logger.info(f"Running warmup {warmup_name}") | |
| await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager) | |
| async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager): | |
| # this warms up the fused_moe triton kernels and caches them | |
| # if we don't do this we break real time inference for voice chat | |
| for i in tqdm.trange(1, 512): | |
| size = i * 4 | |
| generate_req_input = GenerateReqInput( | |
| input_ids=(np.random.randint(2**16, size=[size])).tolist(), | |
| sampling_params={ | |
| "max_new_tokens": 30, | |
| "temperature": 0.8, | |
| "stop_token_ids": [1], | |
| "min_p": 0.0, | |
| }, | |
| ) | |
| if disaggregation_mode != "null": | |
| generate_req_input.bootstrap_room = 0 | |
| generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST | |
| await tokenizer_manager.generate_request(generate_req_input, None).__anext__() | |
Xet Storage Details
- Size:
- 1.83 kB
- Xet hash:
- 36f0ab7d59c42920c78560a826c707d6b3bbe816c0d742a6a2fd3ba13b09e4c7
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.