"""Gradio chat demo that runs inclusionAI/Ring-mini-2.0 locally via transformers."""

from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the model and tokenizer only once, at import time.
MODEL_NAME = "inclusionAI/Ring-mini-2.0"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    trust_remote_code=True,
).to(device)


def _build_messages(message, history, system_message):
    """Return the conversation as a list of ``{"role", "content"}`` dicts.

    Only ``user`` and ``assistant`` turns from *history* are kept, and any
    extra keys on a history entry are dropped. An empty *system_message* is
    omitted rather than sent as a blank system turn.
    """
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for turn in history or []:
        role = turn.get("role")
        if role in ("user", "assistant"):
            messages.append({"role": role, "content": turn["content"]})
    messages.append({"role": "user", "content": message})
    return messages


def _encode_conversation(messages):
    """Tokenize *messages* and return ``(input_ids, attention_mask)`` on ``device``.

    The tokenizer's chat template is used when one is defined. Otherwise the
    conversation is rendered as a plain ``User:`` / ``Assistant:`` transcript.
    """
    if getattr(tokenizer, "chat_template", None):
        encoded = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
    else:
        prefixes = {"system": "", "user": "User: ", "assistant": "Assistant: "}
        transcript = "".join(
            f"{prefixes[m['role']]}{m['content']}\n" for m in messages
        )
        encoded = tokenizer(transcript + "Assistant:", return_tensors="pt")

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded.get("attention_mask")
    if attention_mask is None:
        # Single un-padded sequence: every position is a real token.
        attention_mask = torch.ones_like(input_ids)
    else:
        attention_mask = attention_mask.to(device)
    # Only these two tensors are forwarded; any other tokenizer outputs
    # (e.g. token_type_ids) are deliberately not passed to generate().
    return input_ids, attention_mask


@spaces.GPU
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken = None,  # kept for signature compatibility; unused
):
    """Stream a reply from the locally loaded model.

    Args:
        message: The new user message.
        history: Previous turns as ``{"role": ..., "content": ...}`` dicts
            (the Gradio ``type="messages"`` format).
        system_message: System prompt placed before the conversation.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        hf_token: Accepted for compatibility; not used for local inference.

    Yields:
        The reply accumulated so far, with its internal whitespace intact.
    """
    input_ids, attention_mask = _encode_conversation(
        _build_messages(message, history, system_message)
    )

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "max_new_tokens": int(max_tokens),
        "temperature": float(temperature),
        "top_p": float(top_p),
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }

    errors = []

    def _generate():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # re-raised in the consumer below
            errors.append(exc)
            # Unblock the consumer loop, which would otherwise wait forever.
            streamer.end()

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer and forwards text to the UI.
    worker = Thread(target=_generate, daemon=True)
    worker.start()

    response = ""
    for piece in streamer:
        if not piece:
            continue
        response += piece
        yield response.strip()

    worker.join()
    if errors:
        raise errors[0]


# For information on how to customize the ChatInterface, peruse the gradio docs:
# https://www.gradio.app/docs/chatinterface
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

with gr.Blocks() as demo:
    gr.Markdown("# HuggingFace Running")
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()

if __name__ == "__main__":
    demo.launch()