import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a small conversational model on CPU.
# NOTE: plain from_pretrained already places the model on CPU; the previous
# device_map="cpu" needlessly required the `accelerate` package.
MODEL_NAME = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


def chat(message, history):
    """Generate a reply to *message* and append the turn to *history*.

    Args:
        message: The user's new input string.
        history: List of ``(user_msg, bot_msg)`` pairs — the tuples format
            expected by ``gr.Chatbot`` — or ``None`` on the first call.

    Returns:
        A ``("", history)`` pair: an empty string to clear the input box,
        and the updated history for the chatbot display.
    """
    if history is None:
        history = []

    # DialoGPT was trained on dialogue turns separated by the EOS token,
    # not on "User:/Bot:" labels — build the prompt accordingly.
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += user_msg + tokenizer.eos_token + bot_msg + tokenizer.eos_token
    prompt += message + tokenizer.eos_token

    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # max_new_tokens bounds only the generated continuation (CPU-friendly).
    output_ids = model.generate(
        input_ids,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_p=0.95,
        top_k=50,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(
        output_ids[0][input_ids.shape[1]:], skip_special_tokens=True
    )

    # One (user, bot) pair per turn — this is what gr.Chatbot renders as a
    # left/right message pair. The list is mutated in place, so the gr.State
    # object stays current across calls.
    history.append((message, response))
    return "", history


# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown(
        "# CPU LLM Chat Demo\nThis is a simple chat interface using DialoGPT-small."
    )
    chatbot = gr.Chatbot()
    message_input = gr.Textbox(placeholder="Type your message here...", show_label=False)
    state = gr.State([])
    message_input.submit(chat, [message_input, state], [message_input, chatbot])

# Guard the launch so importing this module does not start a server.
if __name__ == "__main__":
    demo.launch()