import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

import spaces

# --- Model loading ---
# Load the model and tokenizer with "auto" settings; Hugging Face Accelerate
# handles device placement and precision automatically.
MODEL_NAME = "inclusionAI/Ring-mini-2.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)


@spaces.GPU(duration=120)
def generate_response(message, history):
    # Convert history to the chat-messages format, starting with the system prompt
    messages = [
        {"role": "system", "content": "You are Ring, an assistant created by inclusionAI"}
    ]

    # Add conversation history (Gradio tuple-style history: (user, assistant) pairs)
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Apply the model's chat template to build the prompt string
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize the prompt and move it to the model's device
    model_inputs = tokenizer(
        [text], return_tensors="pt", return_token_type_ids=False
    ).to(model.device)

    # Generate the response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=8192,
        temperature=0.7,
        do_sample=True,
        streamer=streamer,
    )

    # Run generation on a background thread so tokens can be consumed as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response, yielding the accumulated text after each new chunk
    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    thread.join()
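
# --- UI wiring (a minimal sketch, not part of the original snippet) ---
# generate_response(message, history) matches the signature Gradio's
# ChatInterface expects for a streaming chat function, and @spaces.GPU
# suggests this script runs as a Hugging Face Space. The block below is one
# plausible way to wire the generator into a UI; the title and description
# strings are placeholders I introduced, not taken from the original. Note
# that the tuple-unpacking of `history` above assumes Gradio's tuple-style
# history format.
import gradio as gr

demo = gr.ChatInterface(
    fn=generate_response,  # the streaming generator defined above
    title="Ring-mini-2.0",  # placeholder title
    description="Chat demo for inclusionAI/Ring-mini-2.0",  # placeholder text
)

if __name__ == "__main__":
    # queue() lets Gradio deliver the partial responses yielded by the generator
    demo.queue().launch()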