Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
# Model identifier passed to both from_pretrained() calls below.
MODEL_ID = "AlexKitipov/Phi-3-mini-128k-instruct"

# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = "You are a helpful AI assistant."

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Weights are loaded as bfloat16 when CUDA is available and float32 otherwise.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.bfloat16,
)
def _text_of(content):
    """Return the plain text carried by a message's content, or "" if none."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): some chat UI versions appear to deliver content as a
        # list of typed parts such as {"type": "text", "text": ...} - confirm
        # against the installed UI library. Non-text parts are ignored.
        texts = (part.get("text") for part in content if isinstance(part, dict))
        return "".join(t for t in texts if isinstance(t, str))
    return ""


def build_prompt(history, user_message):
    """Render the conversation so far plus the new user turn as one prompt.

    Args:
        history: Prior turns as supplied by the chat UI. Accepts either a
            sequence of ``(user, assistant)`` pairs or a sequence of
            ``{"role": ..., "content": ...}`` dicts. ``None`` or an empty
            sequence means there are no prior turns.
        user_message: The new user message, appended as the final turn.

    Returns:
        The prompt text. When the tokenizer has a chat template, that
        template is applied (with the generation prompt added); otherwise a
        plain ``ROLE: content`` transcript ending in ``ASSISTANT:`` is built.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history or ():
        if isinstance(turn, dict):
            # Messages-style history: one dict per turn.
            role = turn.get("role")
            text = _text_of(turn.get("content"))
            if role and text:
                messages.append({"role": role, "content": text})
            continue
        # Pair-style history: (user, assistant); either side may be empty.
        user, assistant = turn
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": user_message})

    # getattr: do not assume every tokenizer object has a chat_template attribute.
    has_template = bool(getattr(tokenizer, "chat_template", None))
    if hasattr(tokenizer, "apply_chat_template") and has_template:
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    # Fallback formatting. The system prompt is already messages[0], so it is
    # emitted once as the "SYSTEM:" line instead of also as a bare prefix.
    lines = [f"{m['role'].upper()}: {m['content']}" for m in messages]
    lines.append("ASSISTANT:")
    return "\n".join(lines)
def chat_fn(message, history):
    """Generate the assistant's reply to ``message`` given prior ``history``.

    Args:
        message: The new user message.
        history: Prior turns; passed through unchanged to ``build_prompt``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off the front), with special tokens removed.
    """
    prompt = build_prompt(history, message)

    # A chat template may already have written the BOS token into the prompt
    # text; letting the tokenizer insert its special tokens again would then
    # duplicate it. Otherwise keep the tokenizer's default behaviour.
    bos = getattr(tokenizer, "bos_token", None)
    prompt_has_bos = bool(bos) and prompt.startswith(bos)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not prompt_has_bos,
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # output[0] holds prompt + continuation; keep only the continuation.
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
# Chat UI whose submitted messages are answered by chat_fn.
demo = gr.ChatInterface(
    description="Chat with the Phi-3-mini-128k-instruct model.",
    title="Phi-3-mini-128k Chat",
    fn=chat_fn,
)

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()