import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub identifier of the checkpoint served by this app.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Both objects are created once, at import time, and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Weights are requested in full precision (float32).
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)


def _content_to_text(content):
    """Return the plain text carried by one message ``content``, or ``None``.

    Accepts a string, a dict with a string ``"text"`` entry, or a list of
    either (joined with newlines). Anything else (``None``, file tuples,
    other objects) yields ``None`` so the caller can skip it.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else None
    if isinstance(content, list):
        texts = [t for t in map(_content_to_text, content) if t is not None]
        return "\n".join(texts) if texts else None
    return None


def build_prompt(message, history):
    """Render the conversation so far plus ``message`` as a single prompt.

    Args:
        message: The new user message; formatted into the final user turn.
        history: Previous turns, or ``None``/empty. Each item may be either
            a ``(user_msg, bot_msg)`` pair (tuple or list), or a dict with
            ``"role"`` and ``"content"`` keys. Only ``"user"`` and
            ``"assistant"`` roles are rendered; turns without usable text
            are skipped instead of being rendered as the word ``None``.

    Returns:
        A string of ``<|user|>`` / ``<|assistant|>`` sections that ends with
        an open ``<|assistant|>`` section for the model to complete.
    """
    # NOTE(review): the model's published chat template may also expect an
    # EOS token after each turn -- confirm against the model card before
    # changing the format below.
    parts = []
    for turn in history or ():
        if isinstance(turn, dict):
            # Messages-style history: one dict per message. Unpacking such a
            # dict as a pair would silently yield its *keys*, not its text.
            role_and_content = [(turn.get("role"), turn.get("content"))]
        else:
            user_msg, bot_msg = turn
            role_and_content = [("user", user_msg), ("assistant", bot_msg)]

        for role, content in role_and_content:
            text = _content_to_text(content)
            if role in ("user", "assistant") and text is not None:
                parts.append(f"<|{role}|>\n{text}\n")

    parts.append(f"<|user|>\n{message}\n<|assistant|>\n")
    return "".join(parts)


def chat(message, history):
    """Generate the assistant's reply to ``message`` given prior ``history``.

    Args:
        message: The new user message.
        history: Previous conversation turns, passed through to
            ``build_prompt``.

    Returns:
        The generated reply as a stripped string, or a fixed fallback
        sentence when the model produced no visible text.
    """
    max_prompt_tokens = 1024

    prompt = build_prompt(message, history)

    # Tokenize the whole prompt and keep only its *tail*. Letting the
    # tokenizer truncate would (with its default right-side truncation) cut
    # off the end of the prompt -- i.e. the newest user message and the
    # trailing "<|assistant|>" cue -- once the conversation grows long.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"][:, -max_prompt_tokens:].to(model.device)
    attention_mask = encoded["attention_mask"][:, -max_prompt_tokens:].to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; slice them off by
    # token count. Comparing decoded strings by length is fragile because
    # decoding the joint sequence need not reproduce the decoded prompt as
    # an exact prefix.
    prompt_length = input_ids.shape[1]
    generated_ids = outputs[0][prompt_length:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return answer or "I am here. Ask me something."


# Chat UI definition: `chat` is registered as the handler that produces the
# reply, and the example prompts are offered as starting points.
demo = gr.ChatInterface(
    fn=chat,
    examples=[
        "Hello",
        "Who are you?",
        "Explain pain in simple words",
        "Write a short Python code",
    ],
    description="TinyLlama chatbot running locally without HF_TOKEN",
    title="My Local LLM Chat",
)

# Start the web server only when this file is executed as a script, so that
# importing the module does not launch the UI.
if __name__ == "__main__":
    demo.launch()