"""Minimal Gradio chat UI around a Hugging Face causal language model."""

import accelerate  # noqa: F401  (unused directly; NOTE(review): presumably needed for device_map="auto" - confirm)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pradipraut737/testmodel"

# Turn markers used to frame the prompt sent to the model.
_USER_TAG = "<|user|>"
_ASSISTANT_TAG = "<|assistant|>"

# Load model & tokenizer once at start-up; half precision is used only when
# a CUDA device is available.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)


def generate_reply(user_input, history):
    """Generate the assistant's answer to *user_input* and extend the chat log.

    Only the current message is sent to the model; earlier turns stored in
    *history* are displayed but are not part of the prompt.

    Args:
        user_input: Text typed by the user. Blank or whitespace-only input
            is ignored and no generation is run.
        history: List of ``(user_message, assistant_reply)`` tuples for the
            conversation so far. ``None`` is treated as an empty list. The
            list passed in is not modified.

    Returns:
        A ``(history, history)`` pair: the same updated list twice, one for
        the ``Chatbot`` display and one for the ``State`` component.
    """
    # Work on a copy so the caller's list is never mutated behind its back.
    history = list(history or [])

    text = (user_input or "").strip()
    if not text:
        # Nothing to answer: skip the expensive generate() call entirely.
        return history, history

    prompt = f"{_USER_TAG}\n{text}\n{_ASSISTANT_TAG}\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        # temperature / top_k / top_p only take effect when sampling is on.
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # For a causal LM the returned sequence is the prompt followed by the
    # completion. Decode only the completion so the reply does not depend on
    # the turn markers surviving `skip_special_tokens=True`.
    prompt_length = inputs["input_ids"].shape[-1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    # If the model emits another assistant marker, keep the text after the
    # last one (same rule the original full-text split applied).
    reply = completion.split(_ASSISTANT_TAG)[-1].strip()

    history.append((user_input, reply))
    return history, history


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("See my Open-Source Chatbot")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Type a question and press Enter")
    state = gr.State([])

    # Enter in the textbox runs a generation and refreshes both the visible
    # chat and the stored history.
    msg.submit(generate_reply, [msg, state], [chatbot, state])
    # "Clear" resets the visible chat and the stored history together.
    gr.Button("Clear").click(lambda: ([], []), None, [chatbot, state])

demo.launch()