leideng/QCFuse / srt /warmup.py
leideng's picture
download
raw
1.83 kB
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, List
import numpy as np
import tqdm
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
from sglang.srt.managers.io_struct import GenerateReqInput
if TYPE_CHECKING:
from sglang.srt.managers.tokenizer_manager import TokenizerManager
# Name the logger after the module's import path rather than its file path,
# so it takes part in the package's dotted logger hierarchy and can be
# configured through its parent loggers.
logger = logging.getLogger(__name__)

# Maps a warmup name to the callable registered for it by the `warmup`
# decorator; `execute_warmups` looks entries up by name.
_warmup_registry = {}
def warmup(name: str):
    """Return a decorator that registers a callable as the warmup called *name*.

    The decorated callable is stored in the module-level warmup registry under
    *name* and handed back unchanged, so it stays usable as a normal function.
    Registering a second callable under the same name replaces the first.
    """

    def register(func):
        _warmup_registry[name] = func
        return func

    return register
async def execute_warmups(
    disaggregation_mode: str,
    warmup_names: List[str],
    tokenizer_manager: TokenizerManager,
):
    """Run the registered warmups named in *warmup_names*, one after another.

    Names are processed in the order given. A name with no entry in the
    warmup registry is reported with a warning and skipped. Every other
    warmup is called with ``(disaggregation_mode, tokenizer_manager)`` and
    awaited to completion before the next one starts.
    """
    for warmup_name in warmup_names:
        try:
            warmup_fn = _warmup_registry[warmup_name]
        except KeyError:
            logger.warning(f"Could not find custom warmup {warmup_name}")
            continue

        logger.info(f"Running warmup {warmup_name}")
        await warmup_fn(disaggregation_mode, tokenizer_manager)
@warmup("voice_chat")
async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
    """Send generation requests over a sweep of random prompt lengths.

    Prompts of 4, 8, ..., 2044 random token ids (each id drawn from
    ``[0, 2**16)``) are submitted one at a time. For each request only the
    first item produced by ``tokenizer_manager.generate_request`` is awaited.
    When *disaggregation_mode* is anything other than ``"null"``, the request
    is also given a bootstrap room of 0 and the fake bootstrap host.
    """
    # this warms up the fused_moe triton kernels and caches them
    # if we don't do this we break real time inference for voice chat
    needs_bootstrap = disaggregation_mode != "null"

    for step in tqdm.trange(1, 512):
        prompt_len = step * 4
        random_token_ids = np.random.randint(2**16, size=[prompt_len]).tolist()
        # A fresh dict per request, so nothing is shared between iterations.
        sampling_params = {
            "max_new_tokens": 30,
            "temperature": 0.8,
            "stop_token_ids": [1],
            "min_p": 0.0,
        }
        request = GenerateReqInput(
            input_ids=random_token_ids,
            sampling_params=sampling_params,
        )

        if needs_bootstrap:
            request.bootstrap_room = 0
            request.bootstrap_host = FAKE_BOOTSTRAP_HOST

        # Only the first yielded item is consumed; the rest of the stream is
        # not iterated here.
        response_stream = tokenizer_manager.generate_request(request, None)
        await response_stream.__anext__()

Xet Storage Details

Size:
1.83 kB
·
Xet hash:
36f0ab7d59c42920c78560a826c707d6b3bbe816c0d742a6a2fd3ba13b09e4c7

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.