"""Gradio chat UI for the Phi-3-mini-128k-instruct causal language model.

Importing this module loads the tokenizer and the model weights for
``MODEL_ID``; running it as a script launches the Gradio chat interface.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "AlexKitipov/Phi-3-mini-128k-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # bfloat16 when CUDA is available, float32 otherwise.
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

SYSTEM_PROMPT = "You are a helpful AI assistant."


def _has_chat_template() -> bool:
    """Return True when the tokenizer can render a chat template."""
    # getattr: tokenizers without a ``chat_template`` attribute must not raise.
    return hasattr(tokenizer, "apply_chat_template") and bool(
        getattr(tokenizer, "chat_template", None)
    )


def _content_text(content) -> str:
    """Extract plain text from one history entry's content.

    Strings are returned unchanged. A list/tuple of ``{"type": "text",
    "text": ...}`` parts is joined with newlines (NOTE(review): handled
    defensively for Gradio versions that deliver structured content --
    confirm against the installed version). Anything else yields ``""``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        texts = [
            part.get("text")
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        ]
        return "\n".join(text for text in texts if isinstance(text, str))
    return ""


def _history_to_messages(history) -> list:
    """Convert chat history into a list of ``{"role", "content"}`` dicts.

    Accepts both history shapes: pair-style turns ``(user, assistant)`` and
    message-style dicts with ``"role"`` and ``"content"`` keys. Entries with
    no text, and dict entries whose role is not ``"user"``/``"assistant"``,
    are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _content_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue
        user, assistant = turn
        for role, content in (("user", user), ("assistant", assistant)):
            text = _content_text(content)
            if text:
                messages.append({"role": role, "content": text})
    return messages


def build_prompt(history, user_message) -> str:
    """Build the full text prompt for the model.

    Args:
        history: Previous turns, either ``(user, assistant)`` pairs or
            ``{"role", "content"}`` message dicts.
        user_message: The new user message to answer.

    Returns:
        The prompt string: the tokenizer's chat template output when a
        template is available, otherwise a plain ``ROLE: content`` transcript
        ending in ``ASSISTANT:``.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": user_message})

    if _has_chat_template():
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    # Fallback formatting. The system prompt is already the first message,
    # so it is emitted once by the loop rather than also prepended verbatim.
    transcript = "".join(
        f"{m['role'].upper()}: {m['content']}\n" for m in messages
    )
    return transcript + "ASSISTANT:"


def chat_fn(message, history) -> str:
    """Generate the assistant reply for ``message`` given ``history``.

    Args:
        message: The new user message.
        history: Previous turns as passed by ``gr.ChatInterface``.

    Returns:
        The decoded newly generated text, with special tokens removed.
    """
    prompt = build_prompt(history, message)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        # A chat-templated prompt already carries its special tokens; adding
        # them again would duplicate them (e.g. a second BOS).
        add_special_tokens=not _has_chat_template(),
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt.
    return tokenizer.decode(
        output[0][prompt_length:],
        skip_special_tokens=True,
    )


demo = gr.ChatInterface(
    fn=chat_fn,
    title="Phi-3-mini-128k Chat",
    description="Chat with the Phi-3-mini-128k-instruct model.",
)

if __name__ == "__main__":
    demo.launch()