import gradio as gr
from vllm import LLM, SamplingParams

# Load the model once at import time; vLLM keeps it resident on the GPU(s)
# for the lifetime of the process.
llm = LLM(
    model="stepfun-ai/Step-Audio-2-mini-Think",  # change to the model you need
    trust_remote_code=True,
    tensor_parallel_size=2,  # number of GPUs to shard across, if multiple
    # gpu_memory_utilization=0.9,  # fraction of GPU memory vLLM may use
    max_model_len=8192,
)


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Run local inference with vLLM and stream the reply to the chat UI.

    Args:
        message: The latest user message.
        history: Prior turns as OpenAI-style ``{"role", "content"}`` dicts
            (the ``type="messages"`` ChatInterface format).
        system_message: System prompt prepended to the conversation.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        hf_token: OAuth token injected by gradio's login flow; unused for
            local inference, but the parameter is kept so login gating via
            ``gr.LoginButton`` still works — TODO confirm gating is intended.

    Yields:
        Progressively longer prefixes of the generated text, simulating
        token-by-token streaming.
    """
    # Build the OpenAI-style conversation: system prompt, then history,
    # then the new user turn.
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    # llm.chat is a blocking call; streaming is simulated afterwards.
    outputs = llm.chat(
        messages=messages,
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    response = outputs[0].outputs[0].text

    # Edge case: an empty generation would otherwise yield nothing at all,
    # leaving the chat bubble blank forever — emit the empty string once.
    if not response:
        yield ""
        return

    # Simulate streaming by yielding ever-longer prefixes of the reply.
    for i in range(1, len(response) + 1):
        yield response[:i]


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# Wrap the chat interface in a Blocks layout with a sidebar login button.
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()


if __name__ == "__main__":
    demo.launch()