Spaces:
Running
Running
| import gradio as gr | |
| from vllm import LLM, SamplingParams | |
# Build the local vLLM engine once, at import time, so that every chat request
# reuses the same loaded model.
llm = LLM(
    # Change this to whichever model you want to serve.
    model="stepfun-ai/Step-Audio-2-mini-Think",
    max_model_len=8192,
    # With multiple GPUs available, set this to the number to shard across.
    tensor_parallel_size=2,
    trust_remote_code=True,
    # gpu_memory_utilization=0.9 (GPU memory utilization) is intentionally
    # not passed here.
)
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Generate a chat reply locally with vLLM and stream it to the UI.

    The whole reply is generated in one blocking ``llm.chat`` call; streaming
    is only simulated afterwards by yielding ever-longer prefixes of the text.

    Args:
        message: The user's newest message.
        history: Earlier turns as dicts carrying "role" and "content" keys.
            ``None`` or an empty list means there are no earlier turns.
        system_message: System prompt placed first in the conversation.
        max_tokens: Upper bound on the number of generated tokens; coerced to
            ``int`` before being handed to ``SamplingParams``.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        hf_token: OAuth token parameter. It is not used by the local
            inference path but is kept so the signature stays unchanged.

    Yields:
        Prefixes of the reply that grow by one character each time, ending
        with the full text. An empty reply is yielded once as ``""``.
    """
    # Assemble the conversation: system prompt, prior turns, new user message.
    messages = [{"role": "system", "content": system_message}]
    # Copy only the two keys the chat call needs. History entries may carry
    # extra UI-only keys depending on the Gradio version (TODO confirm), and
    # those should not leak into the model's chat template.
    messages.extend(
        {"role": turn["role"], "content": turn["content"]}
        for turn in (history or [])
    )
    messages.append({"role": "user", "content": message})

    # Sampling settings. The slider value may arrive as a float, so make the
    # token budget an integer explicitly.
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=int(max_tokens),
    )

    # Run inference through vLLM's chat interface.
    outputs = llm.chat(
        messages=messages,
        sampling_params=sampling_params,
        use_tqdm=False,
    )

    # Text of the first candidate of the first (only) request.
    response = outputs[0].outputs[0].text

    # An empty reply would make the loop below a no-op and leave the generator
    # without a single yield, so emit the empty string once instead.
    if not response:
        yield response
        return

    # Simulated streaming: yield the reply one character longer each step.
    for end in range(1, len(response) + 1):
        yield response[:end]
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Chat UI wired to `respond`. The additional inputs are listed in the same
# order as the extra parameters `respond` declares after (message, history):
# system_message, max_tokens, temperature, top_p.
chatbot = gr.ChatInterface(
    fn=respond,
    type="messages",
    additional_inputs=[
        # System prompt.
        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
        # Generation length cap.
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        # Sampling temperature.
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        # Nucleus-sampling threshold.
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.95,
        ),
    ],
)
# Page layout: a sidebar that holds the login button, followed by the chat
# interface rendered inside the same Blocks container.
demo = gr.Blocks()
with demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()