"""Minimal Gradio chat UI backed by a Hugging Face causal language model."""

from typing import List, Optional, Tuple

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Device label derived from CUDA availability. Kept as a module-level name,
# but generation inputs are moved to ``model.device`` instead, because the
# model is placed by ``device_map="auto"`` rather than by this string.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model
model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision only when CUDA is available; full precision otherwise.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

# Marker that would start a new user turn in the plain-text prompt format.
_USER_TURN_MARKER = "\nUser:"


def _build_prompt(user_input: str, history: List[Tuple[str, str]]) -> str:
    """Render past turns plus the new message as a ``User:/Assistant:`` prompt."""
    past_turns = "".join(
        f"User: {user_turn}\nAssistant: {assistant_turn}\n"
        for user_turn, assistant_turn in history
    )
    return f"{past_turns}User: {user_input}\nAssistant:"


def chat_with_model(
    user_input: str,
    history: Optional[List[Tuple[str, str]]] = None,
) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """Generate a reply to ``user_input`` given the prior conversation.

    Args:
        user_input: The new user message.
        history: Previous ``(user, assistant)`` turns. ``None`` means an
            empty conversation. The list passed in is not modified.

    Returns:
        A pair ``(history, history)`` where both items are the same new list:
        the previous turns followed by ``(user_input, reply)``.
    """
    # ``None`` default avoids sharing one list across unrelated calls.
    previous_turns = [] if history is None else list(history)

    prompt = _build_prompt(user_input, previous_turns)
    # Send inputs to wherever the model actually lives.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Assistant:" would pick the wrong segment whenever the model itself
    # emits that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    generated = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # If the model keeps going and invents a follow-up user turn, keep only
    # the text that precedes it.
    reply = generated.partition(_USER_TURN_MARKER)[0].strip()

    updated_history = previous_turns + [(user_input, reply)]
    return updated_history, updated_history


# Gradio Chatbot UI
with gr.Blocks() as demo:
    # NOTE(review): the title names a different model than ``model_name``
    # above -- confirm which one is intended.
    gr.Markdown("# 🤖 Hyprlyf/hypr1-instruct Chatbot")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here...")
    clear = gr.Button("Clear")

    # Per-session conversation history as a list of (user, assistant) pairs.
    state = gr.State([])

    def respond(message, state):
        """Handle a submitted message; return (chatbot value, new state)."""
        # Ignore blank submissions instead of running generation on them.
        if not message or not message.strip():
            return state, state
        new_state, updated_history = chat_with_model(message, state)
        return updated_history, new_state

    msg.submit(respond, [msg, state], [chatbot, state])
    # Reset both the visible chat and the stored history.
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch(share=True)