"""Minimal Gradio chat demo backed by Falcon-RW-1B.

Maintains a rolling plain-text conversation history and generates each
reply with a CPU text-generation pipeline.
"""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load Falcon-RW-1B once at startup; device=-1 forces CPU inference.
model_name = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

chat_history = []
MAX_HISTORY = 10  # Cap on retained history lines, to bound prompt growth.


def generate_reply(message):
    """Append *message* to the history and return the model's reply.

    The prompt is the newline-joined history plus a trailing "Bot:" cue;
    the reply is the generated continuation truncated at the next
    simulated "User:" turn.

    Args:
        message: The user's chat message (plain text).

    Returns:
        The bot's reply string.
    """
    global chat_history
    chat_history.append(f"User: {message}")
    prompt = "\n".join(chat_history) + "\nBot:"
    result = generator(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = result[0]["generated_text"]
    # Keep only the newly generated continuation, cut at the point where
    # the model starts hallucinating the user's next turn.
    reply = generated[len(prompt):].split("User:")[0].strip()
    chat_history.append(f"Bot: {reply}")
    # Trim from the front in whole User/Bot pairs so the prompt never
    # starts mid-turn (a plain [-MAX_HISTORY:] slice can split a pair).
    if len(chat_history) > MAX_HISTORY:
        excess = len(chat_history) - MAX_HISTORY
        excess += excess % 2  # round up to an even number of dropped lines
        chat_history[:] = chat_history[excess:]
    return reply


with gr.Blocks() as demo:
    txt = gr.Textbox(label="You", placeholder="Type your message here...")
    out = gr.Textbox(label="Bot")
    # api_name must be passed as a keyword to the event listener; assigning
    # it on the returned dependency object afterwards is not supported.
    txt.submit(generate_reply, inputs=txt, outputs=out, api_name="generate_reply")

if __name__ == "__main__":
    # Guarded so importing this module doesn't start a server.
    demo.queue()
    demo.launch(share=True, show_api=True, show_error=True)