import os
import torch
import gradio as gr

from threading import Thread
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)

# -------------------------------------------------------
# Model Settings
# -------------------------------------------------------
# Hugging Face Hub identifier of the checkpoint that gets loaded below.
MODEL_ID = "tiiuae/Falcon3-1B-Instruct"

# System instruction placed at the start of every conversation.
# The surrounding newlines are kept as-is; callers strip them before use.
SYSTEM_PROMPT = (
    "\n"
    "You are a helpful, clear, friendly AI assistant.\n"
    "Answer in a practical way with examples when helpful.\n"
)

# -------------------------------------------------------
# Load Model
# -------------------------------------------------------
print(f"Loading model: {MODEL_ID}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

if torch.cuda.is_available():
    # bfloat16 needs hardware support; older GPUs fall back to float16
    # instead of failing or running a slow emulated path.
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    # Let the loader place the weights on the available GPU(s).
    device_map = "auto"
else:
    # Full precision on CPU; no device map, so the weights stay on CPU.
    dtype = torch.float32
    device_map = None

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map=device_map,
)

# With device_map=None the model is already on CPU, so no explicit move is
# needed. Switch to inference mode (disables dropout and similar layers).
model.eval()

print("Model loaded successfully.")


# -------------------------------------------------------
# Chat Function
# -------------------------------------------------------
def chat_with_falcon(
    message,
    history,
    max_new_tokens,
    temperature,
    top_p,
    repetition_penalty,
):
    """Stream an assistant reply to ``message`` given the prior chat history.

    Builds a chat-template prompt from the system prompt, the user/assistant
    turns in ``history`` and the new ``message``, runs ``model.generate`` on a
    worker thread, and yields the growing reply as text is streamed back.

    Args:
        message: Current user message.
        history: Gradio messages-style chat history (dicts with ``role`` and
            ``content``). Entries with other roles are skipped; ``None`` is
            treated as an empty history.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Sampling temperature. A value <= 0 selects greedy
            decoding, because sampling requires a strictly positive value.
        top_p: Nucleus-sampling threshold (only used when sampling).
        repetition_penalty: Repetition penalty passed to ``generate``.

    Yields:
        The accumulated response text after each streamed chunk.

    Raises:
        RuntimeError: If generation fails on the worker thread.
    """

    messages = [{"role": "system", "content": SYSTEM_PROMPT.strip()}]
    messages.extend(
        {"role": item["role"], "content": item["content"]}
        for item in (history or [])
        if item["role"] in ("user", "assistant")
    )
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The chat template already emits the special tokens it needs; tokenizing
    # with the default add_special_tokens=True could add a second set.
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # model.device is correct on both CPU and GPU, so no branching is needed.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        repetition_penalty=float(repetition_penalty),
        pad_token_id=tokenizer.eos_token_id,
    )

    temperature = float(temperature)
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=float(top_p),
        )
    else:
        # Sampling rejects a non-positive temperature; decode greedily instead.
        generation_kwargs["do_sample"] = False

    failures = []

    def _run_generation():
        """Run generation and make sure the streamer is always terminated."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the consumer
            failures.append(exc)
            # Without the stop signal the consumer loop below would block
            # forever waiting for text that will never arrive.
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    partial_response = ""

    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    # The stream has ended, so the worker is finished or about to finish.
    thread.join()

    if failures:
        raise RuntimeError("Text generation failed") from failures[0]


# -------------------------------------------------------
# Gradio Interface
# -------------------------------------------------------
with gr.Blocks(title="Falcon3-1B-Instruct Chat") as demo:
    gr.Markdown(
        """
        # 🦅 Falcon3-1B-Instruct Chat Interface

        This app runs a local Hugging Face Transformers chat interface using:

        `tiiuae/Falcon3-1B-Instruct`

        Use this to test instruction-following, tutoring, coding help, short explanations, and multilingual chat.
        """
    )

    # Conversation display; holds the history as a list of role/content dicts.
    chatbot = gr.Chatbot(
        label="Falcon3 Chat",
        type="messages",
        height=500,
    )

    with gr.Row():
        textbox = gr.Textbox(
            placeholder="Ask Falcon3 something...",
            label="Your Message",
            scale=5,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Sampling controls forwarded to the generation call on every turn.
    with gr.Accordion("Generation Settings", open=False):
        max_new_tokens = gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
        )

        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        )

        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p",
        )

        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=1.5,
            value=1.1,
            step=0.05,
            label="Repetition Penalty",
        )

    clear_btn = gr.Button("Clear Chat")

    def user_turn(user_message, chat_history):
        """Record the submitted message and clear the textbox.

        Blank (empty or whitespace-only) submissions leave the history
        unchanged so that no empty prompt is sent to the model.

        Returns:
            A pair ``(new_textbox_value, new_history)``.
        """
        # Work on a copy rather than mutating the list that was passed in.
        history = list(chat_history or [])

        if user_message and user_message.strip():
            history.append({"role": "user", "content": user_message})

        return "", history

    def bot_turn(chat_history, max_new_tokens, temperature, top_p, repetition_penalty):
        """Stream the assistant reply to the last user message in the history.

        Yields the full updated history after each streamed chunk. If the
        history is empty or does not end with a user message (for example
        because a blank submission was ignored), it is yielded unchanged.
        """
        if not chat_history or chat_history[-1]["role"] != "user":
            yield chat_history or []
            return

        user_message = chat_history[-1]["content"]
        prior_history = chat_history[:-1]

        for partial in chat_with_falcon(
            user_message,
            prior_history,
            max_new_tokens,
            temperature,
            top_p,
            repetition_penalty,
        ):
            yield prior_history + [
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": partial},
            ]

    generation_inputs = [
        chatbot,
        max_new_tokens,
        temperature,
        top_p,
        repetition_penalty,
    ]

    # The Send button and pressing Enter in the textbox run the same two-step
    # flow: record the user message immediately, then stream the reply.
    for trigger in (submit_btn.click, textbox.submit):
        trigger(
            fn=user_turn,
            inputs=[textbox, chatbot],
            outputs=[textbox, chatbot],
            queue=False,
        ).then(
            fn=bot_turn,
            inputs=generation_inputs,
            outputs=chatbot,
        )

    # Reset the conversation to an empty history.
    clear_btn.click(lambda: [], outputs=chatbot)

demo.queue()
demo.launch()