"""Minimal CPU chatbot: microsoft/phi-2 served through a Gradio ChatInterface."""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# phi-2 (~2.7B params) is small enough to run on CPU in float32.
MODEL_ID = "microsoft/phi-2"

# Load tokenizer and model once at startup.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    device_map="auto",
)

# Shared text-generation pipeline used by every chat request.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.7,
)


def chat_fn(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The user's latest message (str).
        history: Prior conversation in "messages" format — a flat list of
            ``{"role": ..., "content": ...}`` dicts where user and assistant
            turns are SEPARATE entries (there is no "response" key).

    Returns:
        The assistant reply as a plain string, which is a valid
        ``ChatInterface`` return value; a ``{"response": ...}`` dict is not.
    """
    # Rebuild the full prompt from history, tagging each turn by its role.
    prompt = ""
    for turn in history:
        tag = "user" if turn["role"] == "user" else "assistant"
        prompt += f"<|{tag}|>\n{turn['content']}\n"
    prompt += f"<|user|>\n{message}\n<|assistant|>\n"

    # return_full_text=False makes the pipeline return only the newly
    # generated text, which is more robust than stripping the prompt back
    # out with str.replace (that breaks on any whitespace/tokenization
    # difference in the echoed prompt).
    output = generator(prompt, max_new_tokens=256, return_full_text=False)
    return output[0]["generated_text"].strip()


# Pass type="messages" explicitly: older Gradio versions default to the
# deprecated tuples history format, which would break chat_fn's assumptions.
chatbot_ui = gr.ChatInterface(
    fn=chat_fn,
    type="messages",
    title="Phi-2 Chatbot",
    theme="default",
)

if __name__ == "__main__":
    chatbot_ui.launch()