#!/usr/bin/env python3
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer from the current directory
print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    ".",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map=device,
    trust_remote_code=True,
)
print("✓ Model loaded!")


def chat(message, max_tokens, temperature):
    """Generate a response from the model."""
    inputs = tokenizer(message, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),  # sliders return floats; generate expects an int
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
        )
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Create the Gradio interface
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="Message", placeholder="Ask me anything..."),
        gr.Slider(minimum=10, maximum=1024, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs=gr.Textbox(label="Response"),
    title="Zenith Copilot",
    description="Chat with your deployed model",
)

if __name__ == "__main__":
    demo.launch()
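
# --- Usage sketch ---
# A minimal way to query this app programmatically, using gradio_client
# (the official client library that ships alongside Gradio). Assumptions:
# the app is running locally on Gradio's default port 7860, and
# gradio_client is installed. For a gr.Interface, the endpoint name
# defaults to "/predict".
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   reply = client.predict(
#       "What does this model do?",  # Message
#       256,                         # Max Tokens
#       0.7,                         # Temperature
#       api_name="/predict",
#   )
#   print(reply)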