"""Minimal Gradio chat demo backed by Falcon-RW-1B.

Maintains a rolling plain-text conversation history and generates each
reply with a CPU text-generation pipeline.
"""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load Falcon-RW-1B once at startup; device=-1 forces CPU inference.
model_name = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

chat_history = []
MAX_HISTORY = 10  # Cap on retained history lines, to bound prompt growth.


def generate_reply(message):
    """Append *message* to the history and return the model's reply.

    The prompt is the newline-joined history plus a trailing "Bot:" cue;
    the reply is the generated continuation truncated at the next
    simulated "User:" turn.

    Args:
        message: The user's chat message (plain text).

    Returns:
        The bot's reply string.
    """
    global chat_history
    chat_history.append(f"User: {message}")
    prompt = "\n".join(chat_history) + "\nBot:"
    result = generator(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = result[0]["generated_text"]
    # Keep only the newly generated continuation, cut at the point where
    # the model starts hallucinating the user's next turn.
    reply = generated[len(prompt):].split("User:")[0].strip()
    chat_history.append(f"Bot: {reply}")
    # Trim from the front in whole User/Bot pairs so the prompt never
    # starts mid-turn (a plain [-MAX_HISTORY:] slice can split a pair).
    if len(chat_history) > MAX_HISTORY:
        excess = len(chat_history) - MAX_HISTORY
        excess += excess % 2  # round up to an even number of dropped lines
        chat_history[:] = chat_history[excess:]
    return reply


with gr.Blocks() as demo:
    txt = gr.Textbox(label="You", placeholder="Type your message here...")
    out = gr.Textbox(label="Bot")
    # api_name must be passed as a keyword to the event listener; assigning
    # it on the returned dependency object afterwards is not supported.
    txt.submit(generate_reply, inputs=txt, outputs=out, api_name="generate_reply")

if __name__ == "__main__":
    # Guarded so importing this module doesn't start a server.
    demo.queue()
    demo.launch(share=True, show_api=True, show_error=True)